Merge branch 'master' into xsn/mtmd_ds_ocr_tiles

2026-06-27 23:50:20 -05:00 · 2026-06-25 16:28:50 +02:00 · 2026-06-25 16:28:50 +02:00 · 2e4cbade70
commit 2e4cbade70
parent 959ce58197 099bf06952
322 changed files with 18181 additions and 13977 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@ -13,6 +13,20 @@ ARG APP_REVISION=N/A
 # BUILD STAGE
 # Compile all binary files and libraries
 # ==============================================================================
+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
@ -26,6 +40,8 @@ WORKDIR /app
 # -- Copy project files --
 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 # -- Set CANN environment variables (required for compilation) --
 # Using ENV instead of `source` allows environment variables to persist across the entire image layer
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH
@ -16,6 +30,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    else \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@ -11,6 +11,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 ARG GCC_VERSION
@ -26,6 +40,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@ -5,6 +5,20 @@ ARG APP_REVISION=N/A

 ## Build Image

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=ON
@ -22,6 +36,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@ -10,6 +10,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@ -29,6 +43,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@ -22,6 +22,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 ## Build Image
 FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

@ -69,6 +83,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 # Build Stage
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@ -11,6 +11,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

@ -38,6 +52,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build \
        -DGGML_HIP=ON \
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
@ -17,6 +31,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

--- a/.devops/zendnn.Dockerfile
+++ b/.devops/zendnn.Dockerfile
@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
@ -14,6 +28,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
    cmake --build build -j $(nproc)

--- a/.dockerignore
+++ b/.dockerignore
@ -10,6 +10,8 @@

 build*/

+tools/ui/node_modules/
+
 models/*

 /llama-cli
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -35,8 +35,20 @@ AMD ZenDNN:
 documentation:
    - changed-files:
        - any-glob-to-any-file:
+            - "**/*.md"  # quoted: a YAML plain scalar cannot begin with '*' (alias indicator)
            - docs/**
            - media/**
+examples:
+    - all:
+        - changed-files:
+            - any-glob-to-any-file:
+                - app/**
+                - examples/**
+                - tools/**
+            - all-globs-to-all-files:
+                - '!tools/server/**'
+                - '!tools/mtmd/**'
+                - '!tools/ui/**'
 testing:
    - changed-files:
        - any-glob-to-any-file:
@ -47,28 +59,12 @@ build:
            - cmake/**
            - CMakeLists.txt
            - CMakePresets.json
-examples:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/**
-            - tools/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
            - .devops/**
            - .github/**
            - ci/**
-python:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.py"
-            - requirements/**
-            - gguf-py/**
-            - .flake8
-script:
-    - changed-files:
-        - any-glob-to-any-file:
-            - scripts/**
 android:
    - changed-files:
        - any-glob-to-any-file:
@ -81,9 +77,20 @@ server:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/**
-
-
-
+mtmd:
+    - changed-files:
+        - any-glob-to-any-file:
+            - tools/mtmd/**
+conversion:
+    - changed-files:
+        - any-glob-to-any-file:
+            - conversion/**
+            - convert_*.py
+            - gguf-py/**
+vendor:
+    - changed-files:
+        - any-glob-to-any-file:
+            - vendor/**
 ggml:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -58,6 +58,13 @@ jobs:
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0

+  build_ui:
+    name: Build UI
+    needs: create_tag
+    uses: ./.github/workflows/ui-build.yml
+    with:
+      hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
+
  prepare_matrices:
    name: Prepare Docker matrices
    runs-on: ubuntu-24.04
@ -79,7 +86,7 @@ jobs:
          [
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
+            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
@ -135,7 +142,7 @@ jobs:

  push_to_registry:
    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag]
+    needs: [prepare_matrices, create_tag, build_ui]

    runs-on: ${{ matrix.config.runs_on }}
    strategy:
@ -150,6 +157,13 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.create_tag.outputs.source_tag }}

+      - name: Download prebuilt UI
+        if: ${{ matrix.config.prebuilt_ui == true }}
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          name: ui-build
+          path: tools/ui/dist
+
      - name: Set up QEMU
        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -1627,6 +1627,7 @@ jobs:
            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+            - [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@ -25,13 +25,3 @@ Commits:
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
-
-Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -222,6 +222,16 @@ if (LLAMA_BUILD_APP)
    add_subdirectory(app)
 endif()

+# Standalone libmtmd build without pulling in the rest of the tools/ tree.
+# Useful when packaging just the mtmd library for language bindings (e.g. an
+# Apple XCFramework, or a WASM build). When the full tools build is enabled,
+# mtmd is already built by the tools/ subdirectory above; this hook only fires
+# when LLAMA_BUILD_TOOLS (or LLAMA_BUILD_COMMON) is OFF to avoid double-adding the target.
+option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
+if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
+    add_subdirectory(tools/mtmd)
+endif()
+
 #
 # install
 #
--- a/2
+++ b/2
@ -10,7 +10,7 @@
 # ggml-org/ggml-rpc         : rgerganov
 # ggml-org/ggml-sycl        : arthw
 # ggml-org/ggml-vulkan      : 0cc4m, jeffbolznv
-# ggml-org/ggml-webgpu      : reeselevine
+# ggml-org/ggml-webgpu      : reeselevine, yomaytk
 # ggml-org/ggml-zdnn        : taronaeo
 # ggml-org/llama-common     : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
 # ggml-org/llama-mtmd       : ngxson
--- a/README.md
+++ b/README.md
@ -142,7 +142,9 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
+- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
+- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
+- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
 - [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -1,6 +1,6 @@
 set(TARGET llama-app)

-add_executable(${TARGET} llama.cpp)
+add_executable(${TARGET} llama.cpp download.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)

 target_link_libraries(${TARGET} PRIVATE
--- a/app/download.cpp
+++ b/app/download.cpp
@ -0,0 +1,71 @@
+#include "arg.h"
+#include "common.h"
+#include "download.h"
+#include "log.h"
+
+#include <cstdio>
+#include <filesystem>
+
+// Print example invocations of the download command to stdout, with the
+// program name (argv[0]) substituted into every example line.
+static void print_usage(int /*argc*/, char ** argv) {
+    const char * prog = argv[0];
+
+    printf("\nexamples:\n");
+    printf("  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n", prog);
+    printf("  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n", prog);
+    printf("  %s -hf ggml-org/models -hff model.gguf\n", prog);
+    printf("  %s -mu https://example.com/model.gguf -m model.gguf\n", prog);
+    printf("\n");
+}
+
+int llama_download(int argc, char ** argv);
+
+// Entry point of the `download` command.
+//
+// Parses the command line, runs the models handler for the requested model and
+// prints the resulting local path(s) to stdout, one per line.
+// Returns 0 on success and 1 on any error (argument parsing failed, no model
+// source given, exception raised by the handler, model path empty or missing).
+int llama_download(int argc, char ** argv) {
+    common_init();
+
+    common_params params;
+    params.verbosity = LOG_LEVEL_ERROR;
+
+    const bool parsed_ok = common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage);
+    if (!parsed_ok) {
+        return 1;
+    }
+
+    // at least one way of locating the model must have been given
+    const bool no_source = params.model.hf_repo.empty() && params.model.url.empty() &&
+                           params.model.path.empty()    && params.model.docker_repo.empty();
+    if (no_source) {
+        fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
+        return 1;
+    }
+
+    try {
+        auto handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD);
+        common_models_handler_apply(handler, params);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error: %s\n", err.what());
+        return 1;
+    }
+
+    // -hf pointed at a preset repo: print the preset path and stop
+    const auto & preset_path = params.models_preset;
+    if (!preset_path.empty()) {
+        printf("%s\n", preset_path.c_str());
+        return 0;
+    }
+
+    const auto & model_path = params.model.path;
+    if (model_path.empty()) {
+        fprintf(stderr, "error: model download failed\n");
+        return 1;
+    }
+    if (!std::filesystem::exists(model_path)) {
+        fprintf(stderr, "error: model file does not exist: %s\n", model_path.c_str());
+        return 1;
+    }
+
+    printf("%s\n", model_path.c_str());
+
+    // companion files are printed after the main model, only when set
+    const auto & mmproj_path = params.mmproj.path;
+    if (!mmproj_path.empty()) {
+        printf("%s\n", mmproj_path.c_str());
+    }
+    const auto & draft_path = params.speculative.draft.mparams.path;
+    if (!draft_path.empty()) {
+        printf("%s\n", draft_path.c_str());
+    }
+
+    return 0;
+}
--- a/app/llama.cpp
+++ b/app/llama.cpp
@ -19,6 +19,7 @@ int llama_batched_bench(int argc, char ** argv);
 int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
+int llama_download(int argc, char ** argv);

 // Self-update is only supported for binaries built with llama-install.sh
 static int llama_update(int argc, char ** argv) {
@ -61,6 +62,7 @@ static const command cmds[] = {
    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
+    {"download",      "Download a model",                                   {"get"},      false,         llama_download     },
    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@ -13,6 +13,7 @@ LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
+LLAMA_BUILD_MTMD=ON
 GGML_METAL=ON
 GGML_METAL_EMBED_LIBRARY=ON
 GGML_BLAS_DEFAULT=ON
@ -39,6 +40,7 @@ COMMON_CMAKE_ARGS=(
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
+    -DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
    -DGGML_METAL=${GGML_METAL}
@ -126,6 +128,8 @@ setup_framework_structure() {
    cp ggml/include/ggml-cpu.h     ${header_path}
    cp ggml/include/ggml-blas.h    ${header_path}
    cp ggml/include/gguf.h         ${header_path}
+    cp tools/mtmd/mtmd.h           ${header_path}
+    cp tools/mtmd/mtmd-helper.h    ${header_path}

    # Create module map (common for all platforms)
    cat > ${module_path}module.modulemap << EOF
@ -247,6 +251,7 @@ combine_static_libraries() {
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
    )

    # Create temporary directory for processing
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -80,8 +80,6 @@ add_library(${TARGET}
    http.h
    imatrix-loader.cpp
    imatrix-loader.h
-    json-partial.cpp
-    json-partial.h
    json-schema-to-grammar.cpp
    llguidance.cpp
    log.cpp
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -17,6 +17,7 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
+#include <shellapi.h>
 #endif

 #define JSON_ASSERT GGML_ASSERT
@ -296,60 +297,6 @@ struct handle_model_result {
    std::string preset_path;
 };

-static handle_model_result common_params_handle_model(struct common_params_model & model,
-                                                      const common_download_opts & opts) {
-    handle_model_result result;
-
-    if (!model.docker_repo.empty()) {
-        model.path = common_docker_resolve_model(model.docker_repo);
-        model.name = model.docker_repo;
-    } else if (!model.hf_repo.empty()) {
-        // If -m was used with -hf, treat the model "path" as the hf_file to download
-        if (model.hf_file.empty() && !model.path.empty()) {
-            model.hf_file = model.path;
-            model.path = "";
-        }
-        common_download_opts hf_opts = opts;
-        auto download_result = common_download_model(model, hf_opts);
-
-        if (!download_result.preset_path.empty()) {
-            result.found_preset = true;
-            result.preset_path = download_result.preset_path;
-            return result; // skip everything else if preset.ini is used
-        }
-
-        if (download_result.model_path.empty()) {
-            throw std::runtime_error("failed to download model from Hugging Face");
-        }
-
-        model.name = model.hf_repo;
-        model.path = download_result.model_path;
-
-        if (!download_result.mmproj_path.empty()) {
-            result.found_mmproj = true;
-            result.mmproj.path  = download_result.mmproj_path;
-        }
-
-        if (!download_result.mtp_path.empty()) {
-            result.found_mtp = true;
-            result.mtp.path  = download_result.mtp_path;
-        }
-    } else if (!model.url.empty()) {
-        if (model.path.empty()) {
-            auto f = string_split<std::string>(model.url, '#').front();
-            f = string_split<std::string>(f, '?').front();
-            model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
-        }
-
-        auto download_result = common_download_model(model, opts);
-        if (download_result.model_path.empty()) {
-            throw std::runtime_error("failed to download model from " + model.url);
-        }
-    }
-
-    return result;
-}
-
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
@ -394,72 +341,204 @@ static bool parse_bool_value(const std::string & value) {
 }

 //
-// CLI argument parsing functions
+// common_models_handler
 //

-bool common_params_handle_models(common_params & params, llama_example curr_ex) {
-    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
-                                         params.speculative.types.end(),
-                                         COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
+// Derive the default local path for a download URL: everything from the first
+// '#' and from the first '?' is dropped, and the last '/'-separated component
+// of what remains is handed to fs_get_cache_file().
+static std::string get_default_local_path(const std::string & url) {
+    const std::string without_fragment = string_split<std::string>(url, '#').front();
+    const std::string without_query    = string_split<std::string>(without_fragment, '?').front();
+    const std::string file_name        = string_split<std::string>(without_query, '/').back();
+    return fs_get_cache_file(file_name);
+}

+common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
+    common_download_hf_plan plan; // stays default-constructed unless params.model.hf_repo is set (see below)
    common_download_opts opts;
+    // download_mtp is enabled only when DRAFT_MTP is among the configured speculative types
+    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
+                                        params.speculative.types.end(),
+                                        COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
+
+    // only download mmproj if the current example is using it
+    bool use_mmproj = false;
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            use_mmproj = true;
+            break;
+        }
+    }
+
    opts.bearer_token    = params.hf_token;
    opts.offline         = params.offline;
-    opts.skip_download   = params.skip_download;
    opts.download_mtp    = spec_type_draft_mtp;
-    opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
+    opts.download_mmproj = use_mmproj && !params.no_mmproj // example uses mmproj and it was not disabled
+                        && params.mmproj.path.empty() && params.mmproj.url.empty(); // and no explicit mmproj path/url was given

-    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
-    // so we should not auto-discover mtp/mmproj siblings for them
-    common_download_opts sub_opts = opts;
-    sub_opts.download_mtp    = false;
-    sub_opts.download_mmproj = false;
+    if (!params.model.hf_repo.empty()) { // the HF plan is only computed when an HF repo was given
+        plan = common_download_get_hf_plan(params.model, opts);
+    }

-    try {
-        auto res = common_params_handle_model(params.model, opts);
-        if (res.found_preset) {
-            if (!params.models_preset.empty()) {
-                throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
+    return common_models_handler{plan, opts}; // bundle the (possibly empty) plan with the download options
+}
+
+// Reports whether the handler's plan carries a preset entry with a non-empty URL.
+bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
+    const auto & preset = handler.plan.preset;
+    return !preset.url.empty();
+}
+
+// Build one download task per part returned by common_download_get_all_parts()
+// for model.url.
+//  - exactly one part: the destination is model.path when set, otherwise the
+//    default cache location of that part
+//  - otherwise: every part goes to its default cache location, unless
+//    model.path is set, in which case the parts keep their file names but are
+//    placed in the directory portion of model.path ("." when it has no '/')
+static std::vector<common_download_task> build_url_tasks(const common_params_model & model, common_download_opts opts) {
+    const auto parts = common_download_get_all_parts(model.url);
+
+    std::vector<common_download_task> result;
+
+    if (parts.size() == 1) {
+        common_download_task only;
+        only.url = parts[0];
+        if (model.path.empty()) {
+            only.local_path = get_default_local_path(parts[0]);
+        } else {
+            only.local_path = model.path;
+        }
+        only.opts = opts;
+        result.push_back(std::move(only));
+        return result;
+    }
+
+    // directory taken from the user's -m path; stays empty when no -m was given
+    std::string target_dir;
+    if (!model.path.empty()) {
+        const auto slash = model.path.rfind('/');
+        if (slash == std::string::npos) {
+            target_dir = ".";
+        } else {
+            target_dir = model.path.substr(0, slash);
+        }
+    }
+    const bool relocate = !target_dir.empty();
+
+    for (size_t i = 0; i < parts.size(); ++i) {
+        const auto & part_url = parts[i];
+
+        std::string dest = get_default_local_path(part_url);
+        if (relocate) {
+            // keep only the file name of the cache path and move it under target_dir
+            const auto slash = dest.rfind('/');
+            const std::string file_name = slash == std::string::npos ? dest : dest.substr(slash + 1);
+            dest = target_dir + "/" + file_name;
+        }
+
+        common_download_task item;
+        item.url        = part_url;
+        item.opts       = opts;
+        item.local_path = dest;
+        result.push_back(std::move(item));
+    }
+    return result;
+}
+
+// Collect every download required by `params` and by the plan stored in
+// `handler`, run the downloads, then write the resulting local paths back
+// into `params`.
+//
+//  - plain URL sources (main model, mmproj, vocoder, draft) get a default
+//    local path when none was given and one task each (one per part for the
+//    main model)
+//  - a docker repo is resolved first and then handled like a plain URL
+//  - the files listed in handler.plan (model parts, mmproj, mtp, preset)
+//    each get a task whose on_done hook finalizes the file and updates params
+//
+// The tasks are passed to common_download_run_tasks() unless params.offline is
+// set; the on_done hooks are invoked afterwards in both cases.
+// `callback` is stored in the options copied into every task.
+void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
+    std::vector<common_download_task> tasks;
+
+    auto & plan = handler.plan;
+
+    auto opts = handler.opts; // copy
+    opts.callback = callback;
+
+    // handle plain "url" if needed
+    auto handle_url = [&](common_params_model & model) {
+        if (!model.url.empty()) {
+            if (model.path.empty()) {
+                model.path = get_default_local_path(model.url);
            }
+        }
+    };
+    handle_url(params.model);
+    handle_url(params.mmproj);
+    handle_url(params.vocoder.model);
+    handle_url(params.speculative.draft.mparams);
+
+    // optionally, if docker repo is set, resolve it
+    if (!params.model.docker_repo.empty()) {
+        params.model.url  = common_docker_resolve_model(params.model.docker_repo);
+        params.model.path = get_default_local_path(params.model.url);
+    }
+
+    // handle plain "url" tasks (non-hf)
+    if (!params.model.url.empty()) {
+        auto url_tasks = build_url_tasks(params.model, opts);
+        // the first part is what gets loaded, so point params.model.path at it
+        if (!url_tasks.empty()) {
+            std::string first_path = url_tasks.front().local_path;
+            // first_path is captured by value: the hook runs after this scope has ended
+            url_tasks.front().on_done = [&params, first_path]() { params.model.path = first_path; };
+        }
+        for (auto & task : url_tasks) {
+            tasks.push_back(std::move(task));
+        }
+    }
+    if (!params.mmproj.url.empty()) {
+        common_download_task task;
+        task.url        = params.mmproj.url;
+        task.local_path = params.mmproj.path;
+        task.opts       = opts;
+        tasks.push_back(task);
+    }
+    if (!params.vocoder.model.url.empty()) {
+        common_download_task task;
+        task.url        = params.vocoder.model.url;
+        task.local_path = params.vocoder.model.path;
+        task.opts       = opts;
+        tasks.push_back(task);
+    }
+    if (!params.speculative.draft.mparams.url.empty()) {
+        common_download_task task;
+        task.url        = params.speculative.draft.mparams.url;
+        task.local_path = params.speculative.draft.mparams.path;
+        task.opts       = opts;
+        tasks.push_back(task);
+    }
+
+    // handle hf_plan tasks
+    if (!plan.model_files.empty()) {
+        for (size_t i = 0; i < plan.model_files.size(); ++i) {
+            auto & model_file = plan.model_files[i];
+            bool is_first = (i == 0);
+            tasks.emplace_back(model_file, opts, [&, is_first]() {
+                if (is_first) {
+                    // only use first part as model path
+                    params.model.path = hf_cache::finalize_file(model_file);
+                } else {
+                    hf_cache::finalize_file(model_file);
+                }
+            });
+        }
+    }
+    if (!plan.mmproj.local_path.empty()) {
+        tasks.emplace_back(plan.mmproj, opts, [&]() {
+            params.mmproj.path = hf_cache::finalize_file(plan.mmproj);
+        });
+    }
+    if (!plan.mtp.local_path.empty()) {
+        tasks.emplace_back(plan.mtp, opts, [&]() {
+            // only fall back to the discovered MTP head when no draft was explicitly provided
+            if (params.speculative.draft.mparams.empty()) {
+                params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp);
+            } else {
+                hf_cache::finalize_file(plan.mtp);
+            }
+        });
+    }
+    if (!plan.preset.local_path.empty()) {
+        tasks.emplace_back(plan.preset, opts, [&]() {
            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
-            params.models_preset    = res.preset_path;
+            params.models_preset    = hf_cache::finalize_file(plan.preset);
            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
-            return true;
-        }
+        });
+    }

-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (curr_ex == ex) {
-                common_params_handle_model(params.mmproj, sub_opts);
-                break;
-            }
-        }
+    // run all tasks in parallel
+    if (!params.offline) {
+        common_download_run_tasks(tasks);
+    }

-        // when --spec-type mtp is set and no draft model was provided explicitly,
-        // fall back to the MTP head discovered alongside the -hf model
-        if (spec_type_draft_mtp && res.found_mtp &&
-            params.speculative.draft.mparams.path.empty() &&
-            params.speculative.draft.mparams.hf_repo.empty() &&
-            params.speculative.draft.mparams.url.empty()) {
-            params.speculative.draft.mparams.path = res.mtp.path;
+    // download successful, update params with the downloaded paths
+    for (const auto & task : tasks) {
+        if (task.on_done) {
+            task.on_done();
        }
-        common_params_handle_model(params.speculative.draft.mparams, sub_opts);
-        common_params_handle_model(params.vocoder.model,             sub_opts);
-        return true;
-    } catch (const common_skip_download_exception &) {
-        return false;
-    } catch (const std::exception &) {
-        throw;
    }
 }

+//
+// CLI argument parsing functions
+//
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

@ -585,17 +664,22 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    // export_graph_ops loads only metadata
-    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
+    const bool skip_model_download =
+        // server will call common_params_handle_models() later, so we skip it here
+        ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
+        // download calls common_params_handle_models() itself and prints the paths
+        ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
+        // export_graph_ops loads only metadata
+        ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;

    if (!skip_model_download) {
        // handle model and download
-        common_params_handle_models(params, ctx_arg.ex);
+        common_models_handler handler = common_models_handler_init(params, ctx_arg.ex);
+        common_models_handler_apply(handler, params);

        // model is required (except for server)
        // TODO @ngxson : maybe show a list of available models in CLI in this case
        if (params.model.path.empty()
-                && ctx_arg.ex != LLAMA_EXAMPLE_SERVER
                && !params.usage
                && !params.completion) {
            throw std::invalid_argument("error: --model is required\n");
@ -663,15 +747,19 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
            common_options.push_back(&opt);
        }
    }
-    printf("----- common params -----\n\n");
-    print_options(common_options);
-    printf("\n\n----- sampling params -----\n\n");
-    print_options(sampling_options);
-    printf("\n\n----- speculative params -----\n\n");
-    print_options(spec_options);
-    // TODO: maybe convert enum llama_example to string
-    printf("\n\n----- example-specific params -----\n\n");
-    print_options(specific_options);
+    bool first = true;
+    auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
+        if (options.empty()) {
+            return;
+        }
+        printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
+        first = false;
+        print_options(options);
+    };
+    print_section("common params",           common_options);
+    print_section("sampling params",         sampling_options);
+    print_section("speculative params",      spec_options);
+    print_section("example-specific params", specific_options);
 }

 static void common_params_print_completion(common_params_context & ctx_arg) {
@ -893,7 +981,44 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
    return true;
 }

+#ifdef _WIN32
+struct utf8_argv { // UTF-8 copy of the process arguments; ptrs refers to the strings held in buf
+    std::vector<std::string> buf; // one UTF-8 string per argument (owns the storage)
+    std::vector<char*> ptrs; // argv-style pointer array into buf
+};
+
+static utf8_argv make_utf8_argv() { // rebuild the argument list as UTF-8 from the wide-character process command line
+    utf8_argv out;
+    int wargc = 0;
+    LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
+    if (!wargv) return out; // parsing failed: return an empty result (buf and ptrs both empty)
+
+    out.buf.reserve(wargc);
+    for (int i = 0; i < wargc; ++i) {
+        int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr); // size query only (no output buffer); input length -1 means NUL-terminated
+        if (n <= 0) { out.buf.emplace_back(); continue; } // conversion failed: keep the slot as an empty string so indices still line up with wargv
+        auto& s = out.buf.emplace_back();
+        s.resize(static_cast<size_t>(n - 1)); // n - 1 characters: the terminator is not part of the string's size
+        (void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr); // result ignored: the size query above already succeeded
+    }
+    LocalFree(wargv);
+
+    out.ptrs.reserve(out.buf.size() + 1);
+    for (auto& s : out.buf) out.ptrs.push_back(s.data());
+    out.ptrs.push_back(nullptr); // trailing null pointer, like argv[argc]
+    return out;
+}
+#endif
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+#ifdef _WIN32
+    auto utf8 = make_utf8_argv();
+    // repair argv only when it matches the process command line
+    if (static_cast<int>(utf8.buf.size()) == argc) {
+        argv = utf8.ptrs.data();
+    }
+#endif
+
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

@ -1034,7 +1159,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
+        // download only exposes the handful of args explicitly tagged for it
+        const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
+        if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@ -1045,7 +1172,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.usage = true;
        }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
@ -2167,7 +2294,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, bool value) {
            params.no_mmproj = !value;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
@ -2566,14 +2693,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.url = value;
        }
-    ).set_env("LLAMA_ARG_MODEL_URL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
@ -2582,7 +2709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.docker_repo = value;
        }
-    ).set_env("LLAMA_ARG_DOCKER_REPO"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@ -2592,14 +2719,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.hf_repo = value;
        }
-    ).set_env("LLAMA_ARG_HF_REPO"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_file = value;
        }
-    ).set_env("LLAMA_ARG_HF_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
@ -2620,7 +2747,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.hf_token = value;
        }
-    ).set_env("HF_TOKEN"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
+    add_opt(common_arg(
+        {"--mtp"},
+        "also download the multi-token prediction (MTP) head, if available (default: unused)",
+        [](common_params & params) {
+            params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
+        }
+    ).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
@ -2830,62 +2964,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.api_prefix = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
-    // Deprecated: use --ui-config instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui-config"}, "JSON",
-        "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
-        [](common_params & params, const std::string & value) {
-            params.ui_config_json = value;
-            params.webui_config_json = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
-
-    add_opt(common_arg(
-        {"--ui-config"}, "JSON",
+        {"--ui-config", "--webui-config"}, "JSON",
        "JSON that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = value;
-            params.webui_config_json = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
-
-    // Deprecated: use --ui-config-file instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui-config-file"}, "PATH",
-        "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
-        [](common_params & params, const std::string & value) {
-            params.ui_config_json = read_file(value);
-            params.webui_config_json = params.ui_config_json;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
-
-    add_opt(common_arg(
-        {"--ui-config-file"}, "PATH",
+        {"--ui-config-file", "--webui-config-file"}, "PATH",
        "JSON file that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = read_file(value);
-            params.webui_config_json = params.ui_config_json;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
-
-    // Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui-mcp-proxy"},
-        {"--no-webui-mcp-proxy"},
-        "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
-        [](common_params & params, bool value) {
-            params.ui_mcp_proxy = value;
-            params.webui_mcp_proxy = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
-
-    add_opt(common_arg(
-        {"--ui-mcp-proxy"},
-        {"--no-ui-mcp-proxy"},
+        {"--ui-mcp-proxy", "--webui-mcp-proxy"},
+        {"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
        "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
        [](common_params & params, bool value) {
            params.ui_mcp_proxy = value;
-            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
    add_opt(common_arg(
@ -2897,24 +2995,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.server_tools = parse_csv_row(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
-    // Deprecated: use --ui/--no-ui instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui"},
-        {"--no-webui"},
-        "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
+        {"-ag", "--agent"},
+        {"-no-ag", "--no-agent"},
+        "whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
        [](common_params & params, bool value) {
-            params.ui = value;
-            params.webui = value;
+            if (value) {
+                params.server_tools = {"all"};
+                params.ui_mcp_proxy = true;
+            } else {
+                params.server_tools.clear();
+                params.ui_mcp_proxy = false;
+            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
-
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
    add_opt(common_arg(
-        {"--ui"},
-        {"--no-ui"},
+        {"--ui", "--webui"},
+        {"--no-ui", "--no-webui"},
        string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.ui = value;
-            params.webui = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
    add_opt(common_arg(
@ -2945,7 +3045,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
    add_opt(common_arg(
        {"--api-key-file"}, "FNAME",
-        "path to file containing API keys (default: none)",
+        "path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream key_file(value);
            if (!key_file) {
@ -2953,7 +3053,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            std::string key;
            while (std::getline(key_file, key)) {
-                if (!key.empty()) {
+                if (!key.empty() && key[0] != '#') {
                    params.api_keys.push_back(key);
                }
            }
--- a/common/arg.h
+++ b/common/arg.h
@ -1,12 +1,14 @@
 #pragma once

 #include "common.h"
+#include "download.h"

 #include <set>
 #include <map>
 #include <string>
 #include <vector>
 #include <cstring>
+#include <memory>

 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
@ -129,11 +131,19 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-// populate model paths (main model, mmproj, etc) from -hf if necessary
-// return true if the model is ready to use
-// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
-// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
-bool common_params_handle_models(common_params & params, llama_example curr_ex);
+struct common_models_handler { // state produced by common_models_handler_init() and consumed by common_models_handler_apply()
+    common_download_hf_plan plan;
+    common_download_opts opts;
+};
+
+// initialize downloading opts and hf_plan if needed, but does not download anything yet
+common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex);
+
+// check if the model is a preset repo (i.e. has a preset file)
+bool common_models_handler_is_preset_repo(const common_models_handler & handler);
+
+// download and update params with the downloaded model path
+void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -395,10 +395,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(until_suffix) :
-                                p.tool_arg_json_value(p.schema(
-                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false))) +
-                           p.tool_arg_close(p.literal(arguments.value_suffix)));
+                                p.ac(p.tool_arg_string_value(until_suffix) +
+                                    p.tool_arg_close(p.literal(arguments.value_suffix)), arguments.value_suffix) :
+                                (p.tool_arg_json_value(p.schema(
+                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
+                                    p.tool_arg_close(p.literal(arguments.value_suffix)))));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
            if (is_required) {
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -90,41 +90,93 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
    return text;
 }

-std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
-    if (delims.empty() || prompt.empty()) {
-        return {};
+common_chat_role common_chat_role_from_string(const std::string & role) { // map a role name to its enum value (exact, case-sensitive comparison)
+    if (role == "system")    { return COMMON_CHAT_ROLE_SYSTEM;    }
+    if (role == "assistant") { return COMMON_CHAT_ROLE_ASSISTANT; }
+    if (role == "user")      { return COMMON_CHAT_ROLE_USER;      }
+    if (role == "tool")      { return COMMON_CHAT_ROLE_TOOL;      }
+    return COMMON_CHAT_ROLE_UNKNOWN; // any other string, including an empty one
+}
+
+const char * common_chat_role_to_string(common_chat_role role) { // map a role to its name; the result is a string literal
+    switch (role) {
+        case COMMON_CHAT_ROLE_SYSTEM:    return "system";
+        case COMMON_CHAT_ROLE_ASSISTANT: return "assistant";
+        case COMMON_CHAT_ROLE_USER:      return "user";
+        case COMMON_CHAT_ROLE_TOOL:      return "tool";
+        case COMMON_CHAT_ROLE_UNKNOWN:   return ""; // the unknown role has no name
+    }
+    return ""; // reached only for a value that is not one of the enumerators
+}
+
+json common_chat_msg_delimiters::to_json() const { // serialize as an array of { "role", "delimiter" } objects; tokens are not included
+    json result = json::array();
+    for (const auto & d : delimiters) {
+        result.push_back({
+            { "role",      common_chat_role_to_string(d.role) }, // unknown role is written as an empty string
+            { "delimiter", d.delimiter                        },
+        });
+    }
+    return result;
+}
+
+common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) { // build delimiters from a JSON array of { "role", "delimiter" } objects
+    common_chat_msg_delimiters result;
+
+    if (!delimiters.is_array()) { // anything that is not an array yields an empty result
+        return result;
    }

-    auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
-        std::vector<std::string>       all_delims;
-        std::vector<common_peg_parser> tagged_messages;
-
-        all_delims.reserve(delims.size());
-        tagged_messages.reserve(delims.size());
-        for (const auto & d : delims) {
-            all_delims.push_back(d.delimiter);
+    result.delimiters.reserve(delimiters.size());
+    for (const auto & d : delimiters) {
+        if (!d.is_object()) { // entries that are not objects are ignored
+            continue;
        }
-
-        auto any_delim = p.until_one_of(all_delims);
-        for (const auto & d : delims) {
-            tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
-        }
-
-        return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
-    });
-
-    common_peg_parse_context ctx(prompt);
-    const auto result = parser.parse(ctx);
-    if (!result.success()) {
-        return {};
+        result.delimiters.push_back({ // tokens stay empty here; they are filled in by tokenize()
+            common_chat_role_from_string(d.value("role", std::string())), // missing or unrecognized role -> COMMON_CHAT_ROLE_UNKNOWN
+            d.value("delimiter", std::string()), // missing delimiter -> empty string
+        });
    }

-    std::vector<common_chat_msg_span> spans;
-    ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
-        if (!node.tag.empty()) {
-            spans.push_back({ node.tag, node.start, node.end - node.start });
+    return result;
+}
+
+void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) { // fill in the tokens of every delimiter from its text
+    for (auto & d : delimiters) {
+        d.tokens = common_tokenize(vocab, d.delimiter, false, true); // NOTE(review): flags presumably mean add_special=false, parse_special=true - confirm against common_tokenize
    }
+}
+
+common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips) const { // cut tokens into spans: each starts at a delimiter match and ends at the next match (or at the end of tokens)
+    std::vector<std::pair<common_chat_role, size_t>> matches; // (role, start index) of every delimiter hit, in increasing index order
+
+    auto skip = skips.begin(); // assumes skip regions do not overlap: one that starts inside an already skipped region is never reached, nor are the ones after it
+    for (size_t i = 0; i < tokens.size();) {
+        if (skip != skips.end() && i == skip->first) {
+            i += skip->second; // jump over the region without looking for delimiters inside it
+            ++skip;
+            continue;
        }
-    });
+        for (const auto & d : delimiters) { // the first delimiter that matches wins, so declaration order matters
+            if (d.tokens.empty() || i + d.tokens.size() > tokens.size()) { // an empty token list (not tokenized yet, or empty delimiter text) would compare equal at every index
+                continue;
+            }
+            if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) {
+                matches.emplace_back(d.role, i);
+                break;
+            }
+        }
+        i++;
+    }
+
+    matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size()); // sentinel that closes the last span
+
+    common_chat_msg_spans spans; // tokens before the first match end up in no span
+    for (size_t i = 0; i + 1 < matches.size(); i++) {
+        const auto & curr = matches[i];
+        const auto & next = matches[i + 1];
+        spans.add(curr.first, curr.second, next.second - curr.second);
+    }

    return spans;
 }
@ -1081,13 +1133,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

    data.prompt            = prompt;
    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
-    data.message_spans = common_chat_split_by_role(prompt, {
-        { "assistant", "<|start|>assistant" },
-        { "user",      "<|start|>user"      },
-        { "system",    "<|start|>developer" },
-        { "system",    "<|start|>system"    },
-        { "tool",      "<|start|>functions" },
-    });
+    data.message_delimiters = {
+        { COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" },
+        { COMMON_CHAT_ROLE_USER,      "<|start|>user"      },
+        { COMMON_CHAT_ROLE_SYSTEM,    "<|start|>developer" },
+        { COMMON_CHAT_ROLE_SYSTEM,    "<|start|>system"    },
+        { COMMON_CHAT_ROLE_TOOL,      "<|start|>functions" },
+    };

    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
@ -1228,10 +1280,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
        data.prompt += data.generation_prompt;
    }

-    data.message_spans = common_chat_split_by_role(data.prompt, {
-        { "user",      "<|turn>user\n"  },
-        { "assistant", "<|turn>model\n" },
-    });
+    data.message_delimiters = {
+        { COMMON_CHAT_ROLE_USER,      "<|turn>user"  },
+        { COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" },
+    };

    data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
    data.supports_thinking  = true;
@ -2030,15 +2082,15 @@ static common_chat_params common_chat_params_init_cohere2moe(const common_chat_t
        RESULT_START, RESULT_END,
    };

-    // Split the rendered prompt into per-role message spans. Tool results are rendered with the
+    // Declare per-role message delimiters. Tool results are rendered with the
    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
    // the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
-    data.message_spans = common_chat_split_by_role(data.prompt, {
-        { "assistant", GEN_PREFIX },
-        { "user",      TURN_START + USER },
-        { "tool",      TURN_START + SYSTEM + RESULT_START },
-        { "system",    TURN_START + SYSTEM },
-    });
+    data.message_delimiters = {
+        { COMMON_CHAT_ROLE_ASSISTANT, GEN_PREFIX },
+        { COMMON_CHAT_ROLE_USER,      TURN_START + USER },
+        { COMMON_CHAT_ROLE_TOOL,      TURN_START + SYSTEM + RESULT_START },
+        { COMMON_CHAT_ROLE_SYSTEM,    TURN_START + SYSTEM },
+    };

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
@ -2526,17 +2578,15 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        autoparser.analyze_template(tmpl);
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);

-        std::vector<common_chat_msg_delimiter> delimiters;
+        common_chat_msg_delimiters delimiters;
        if (!autoparser.assistant_start.empty()) {
-            delimiters.push_back({ "assistant", autoparser.assistant_start });
+            delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start);
        }
        if (!autoparser.user_start.empty()) {
-            delimiters.push_back({ "user", autoparser.user_start });
+            delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start);
        }

-        if (!delimiters.empty()) {
-            auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
-        }
+        auto_params.message_delimiters = std::move(delimiters);

        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
@ -2708,5 +2758,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
    GGML_ASSERT(chat_templates != nullptr);
    GGML_ASSERT(chat_templates->template_default != nullptr);
+    if (chat_templates->template_tool_use != nullptr) {
+        // report the tool-use template's capabilities when one is set; the default template is only the fallback
+        return chat_templates->template_tool_use->caps.to_map();
+    }
    return chat_templates->template_default->caps.to_map();
 }
--- a/common/chat.h
+++ b/common/chat.h
@ -143,15 +143,75 @@ struct common_chat_msg_diff {
    }
 };

+enum common_chat_role { // role of a chat message; string forms are defined by common_chat_role_from_string / common_chat_role_to_string
+    COMMON_CHAT_ROLE_UNKNOWN, // default value of spans and delimiters; also the result for an unrecognized role name
+    COMMON_CHAT_ROLE_SYSTEM,
+    COMMON_CHAT_ROLE_ASSISTANT,
+    COMMON_CHAT_ROLE_USER,
+    COMMON_CHAT_ROLE_TOOL
+};
+
+common_chat_role common_chat_role_from_string(const std::string & role);
+const char *     common_chat_role_to_string(common_chat_role role);
+
 struct common_chat_msg_span {
-    std::string role;
+    common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN; // role of the message this span covers
    std::size_t pos = 0;
    std::size_t len = 0;
+
+    bool valid() const { // a span with the unknown role (e.g. default-constructed) is not valid
+        return role != COMMON_CHAT_ROLE_UNKNOWN;
+    }
+};
+
+struct common_chat_msg_spans { // list of message spans, in insertion order, with lookup helpers for user messages
+    std::vector<common_chat_msg_span> spans;
+
+    void add(common_chat_role role, size_t pos, size_t len) { // append a span
+        spans.push_back({ role, pos, len });
+    }
+
+    bool is_user_start(int32_t pos) const { // true if some user span begins exactly at pos
+        for (auto it = spans.begin(); it != spans.end(); ++it) {
+            if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    int32_t last_user_message_pos() const { // start of the most recently added user span, or -1 if there is none
+        for (auto it = spans.rbegin(); it != spans.rend(); ++it) {
+            if (it->role == COMMON_CHAT_ROLE_USER) {
+                return (int32_t) it->pos;
+            }
+        }
+        return -1;
+    }
 };

 struct common_chat_msg_delimiter {
-    std::string role;
-    std::string delimiter;
+    common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN; // role of the message that this delimiter starts
+    std::string      delimiter; // delimiter text
+    llama_tokens     tokens = {}; // tokenized delimiter; empty until common_chat_msg_delimiters::tokenize() fills it in
+};
+
+struct common_chat_msg_delimiters { // ordered set of per-role delimiters used to split a token sequence into message spans
+    std::vector<common_chat_msg_delimiter> delimiters;
+
+    common_chat_msg_delimiters() = default;
+    common_chat_msg_delimiters(std::initializer_list<common_chat_msg_delimiter> delims) : delimiters(delims) {} // allows `= { { role, "text" }, ... }`
+
+    void add(common_chat_role role, const std::string & delimiter) { // append a delimiter; its tokens start out empty
+        delimiters.push_back({ role, delimiter });
+    }
+
+    void tokenize(const llama_vocab * vocab); // compute the tokens of every delimiter from its text
+
+    // split tokens into message spans. skips maps the start index of a region to its length; such regions are jumped over without matching
+    common_chat_msg_spans split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips = {}) const;
+
+    nlohmann::ordered_json to_json() const; // array of { "role", "delimiter" } objects
 };

 struct common_chat_tool {
@ -219,7 +279,7 @@ struct common_chat_params {
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
    std::string                         parser;
-    std::vector<common_chat_msg_span>   message_spans;
+    common_chat_msg_delimiters          message_delimiters;
 };

 // per-message parsing syntax
@ -325,5 +385,4 @@ struct common_chat_prompt_preset {

 common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);

-std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
-
+common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters);
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1074,6 +1074,18 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
    return files;
 }

+std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) { // open fname for reading; on Windows fname is converted from UTF-8 to a wide-character path first
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); // size query only (no output buffer); input length -1 means NUL-terminated
+    if (!wlen) { return std::ifstream(); } // conversion failed: return a stream that is not associated with any file
+    std::vector<wchar_t> wfname(wlen);
+    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen); // result ignored: the size query above already succeeded
+    return std::ifstream(wfname.data(), mode); // relies on the wide-character path overload of the ifstream constructor
+#else
+    return std::ifstream(fname, mode);
+#endif
+}
+
 //
 // TTY utils
 //
@ -2034,7 +2046,7 @@ bool common_prompt_batch_decode(
 }

 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
+    return data_tgt.size() + data_dft.size() + data_spec.size();
 }

 bool common_prompt_checkpoint::empty() const {
@ -2049,6 +2061,7 @@ void common_prompt_checkpoint::clear() {

    data_tgt.clear();
    data_dft.clear();
+    data_spec.clear();
 }

 void common_prompt_checkpoint::update_pos(
@ -2138,4 +2151,5 @@ void common_prompt_checkpoint::clear_tgt() {

 void common_prompt_checkpoint::clear_dft() {
    data_dft.clear();
+    data_spec.clear();
 }
--- a/common/common.h
+++ b/common/common.h
@ -96,6 +96,7 @@ enum llama_example {
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
+    LLAMA_EXAMPLE_DOWNLOAD,

    LLAMA_EXAMPLE_COUNT,
 };
@ -290,12 +291,25 @@ struct common_params_sampling {
 };

 struct common_params_model {
-    std::string path        = ""; // model local path                                       // NOLINT
-    std::string url         = ""; // model url to download                                  // NOLINT
-    std::string hf_repo     = ""; // HF repo                                                // NOLINT
-    std::string hf_file     = ""; // HF file                                                // NOLINT
-    std::string docker_repo = ""; // Docker repo                                            // NOLINT
-    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
+    std::string path        = ""; // model local path
+    std::string url         = ""; // model url to download
+    std::string hf_repo     = ""; // HF repo
+    std::string hf_file     = ""; // HF file
+    std::string docker_repo = ""; // Docker repo
+
+    std::string get_name() const { // identifier of the model: the first non-empty of hf_repo, docker_repo, path
+        if (!hf_repo.empty()) {
+            return hf_repo;
+        }
+        if (!docker_repo.empty()) {
+            return docker_repo;
+        }
+        return path; // may itself be empty; url and hf_file are never consulted
+    }
+
+    bool empty() const { // true when hf_repo, docker_repo and path are all empty
+        return get_name().empty(); // NOTE(review): a model given only by url counts as empty here - confirm path is always filled in for url models
+    }
 };

 // draft-model-based speculative decoding parameters
@ -358,12 +372,12 @@ struct common_params_speculative {
    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
+        return !draft.mparams.empty(); // a draft model is configured; unlike the previous check this presumably also covers a docker_repo-only draft (assumes mparams is a common_params_model)
    }

    uint32_t need_n_rs_seq() const {
        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
        });

        return needs_rs_seq ? draft.n_max : 0u;
@ -510,7 +524,6 @@ struct common_params {
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
-    bool    skip_download              = false; // skip model file downloading

    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@ -600,7 +613,7 @@ struct common_params {
    bool    cache_prompt        = true;  // whether to enable prompt caching
    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
+    int32_t checkpoint_min_step = 8192;  // minimum spacing between context checkpoints
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
@ -624,12 +637,6 @@ struct common_params {

    // UI configs
    bool ui = true;
-
-    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
-    bool webui = ui;
-    bool webui_mcp_proxy = false;
-    std::string webui_config_json;
-
    bool ui_mcp_proxy = false;
    std::string ui_config_json;

@ -848,6 +855,9 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

+// open a file as std::ifstream; on Windows the UTF-8 file name is converted to wide chars first
+std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
+
 //
 // TTY utils
 //
@ -1065,6 +1075,10 @@ struct common_prompt_checkpoint {
    std::vector<uint8_t> data_tgt;
    std::vector<uint8_t> data_dft;

+    // (optional) speculative-decoding implementation state stashed with the checkpoint
+    // (e.g. eagle3's deferred-boundary g_embd row)
+    std::vector<uint8_t> data_spec;
+
    size_t size() const;

    bool empty() const;
--- a/common/download.cpp
+++ b/common/download.cpp
@ -292,10 +292,6 @@ static int common_download_file_single_online(const std::string & url,

    const bool file_exists = std::filesystem::exists(path);

-    if (!file_exists && opts.skip_download) {
-        return -2; // file is missing and download is disabled
-    }
-
    if (file_exists && skip_etag) {
        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
        return 304; // 304 Not Modified - fake cached response
@ -362,9 +358,6 @@ static int common_download_file_single_online(const std::string & url,
            return 304; // 304 Not Modified - fake cached response
        }
        // pass this point, the file exists but is different from the server version, so we need to redownload it
-        if (opts.skip_download) {
-            return -2; // special code to indicate that the download was skipped due to etag mismatch
-        }
        if (remove(path.c_str()) != 0) {
            LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
            return -1;
@ -691,19 +684,8 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
    }
 }

-struct hf_plan {
-    hf_cache::hf_file primary;
-    hf_cache::hf_files model_files;
-    hf_cache::hf_file mmproj;
-    hf_cache::hf_file mtp;
-    hf_cache::hf_file preset; // if set, only this file is downloaded
-};
-
-static hf_plan get_hf_plan(const common_params_model  & model,
-                           const common_download_opts & opts,
-                           bool download_mmproj,
-                           bool download_mtp) {
-    hf_plan plan;
+common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) {
+    common_download_hf_plan plan;
    hf_cache::hf_files all;

    auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
@ -752,125 +734,49 @@ static hf_plan get_hf_plan(const common_params_model  & model,
    plan.primary = primary;
    plan.model_files = get_split_files(all, primary);

-    if (download_mmproj) {
+    if (opts.download_mmproj) {
        plan.mmproj = find_best_mmproj(all, primary.path);
    }
-
-    if (download_mtp) {
+    if (opts.download_mtp) {
        plan.mtp = find_best_mtp(all, primary.path);
    }

    return plan;
 }

-struct download_task {
-    std::string url;
-    std::string path;
-};
-
-static std::vector<download_task> get_url_tasks(const common_params_model & model) {
-    auto split = get_gguf_split_info(model.url);
-
-    if (split.count <= 1) {
-        return {{model.url, model.path}};
-    }
-
-    auto filename = split.prefix;
-    if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
-        filename = split.prefix.substr(pos + 1);
-    }
-
-    auto parent_path = std::filesystem::path(model.path).parent_path();
-    auto prefix_path = (parent_path / filename).string();
-
-    std::vector<download_task> tasks;
-    for (int i = 1; i <= split.count; i++) {
-        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
-        tasks.push_back({split.prefix + suffix, prefix_path + suffix});
-    }
-    return tasks;
-}
-
-common_download_model_result common_download_model(const common_params_model  & model,
-                                                   const common_download_opts & opts) {
-    common_download_model_result result;
-    std::vector<download_task> tasks;
-    hf_plan hf;
-
-    bool download_mmproj = opts.download_mmproj;
-    bool download_mtp = opts.download_mtp;
-    bool is_hf = !model.hf_repo.empty();
-
-    if (is_hf) {
-        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
-        if (!hf.preset.path.empty()) {
-            // if preset.ini exists, only download that file alone
-            tasks.push_back({hf.preset.url, hf.preset.local_path});
-        } else {
-            for (const auto & f : hf.model_files) {
-                tasks.push_back({f.url, f.local_path});
-            }
-            if (!hf.mmproj.path.empty()) {
-                tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
-            }
-            if (!hf.mtp.path.empty()) {
-                tasks.push_back({hf.mtp.url, hf.mtp.local_path});
-            }
-        }
-    } else if (!model.url.empty()) {
-        tasks = get_url_tasks(model);
-    } else {
-        result.model_path = model.path;
-        return result;
-    }
-
-    if (tasks.empty()) {
-        return result;
-    }
-
+void common_download_run_tasks(const std::vector<common_download_task> & tasks) { // starts one async download per task, then collects the results in task order; throws std::runtime_error on the first non-OK status
    std::vector<std::future<int>> futures;
    for (const auto & task : tasks) {
        futures.push_back(std::async(std::launch::async,
-            [&task, &opts, is_hf]() {
-                return common_download_file_single(task.url, task.path, opts, is_hf);
+            [&task]() { // by-reference capture: task is an element of the caller's vector, which outlives the local futures
+                return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf); // each task carries its own options
            }
        ));
    }

-    for (auto & f : futures) {
-        int status = f.get();
-        if (status == -2 && opts.skip_download) {
-            throw common_skip_download_exception();
-        }
+    for (size_t i = 0; i < futures.size(); ++i) { // futures[i] belongs to tasks[i] (same insertion order)
+        std::string url = tasks[i].url; // kept for the error message below
+        int status = futures[i].get(); // blocks until download i has finished
        bool is_ok = is_http_status_ok(status);
        if (!is_ok) {
-            return {};
+            throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status)); // failures are now reported by exception instead of an empty result
        }
    }
+} // NOTE(review): task.on_done is never invoked in this function - presumably the caller runs it afterwards; confirm

-    if (is_hf) {
-        if (!hf.preset.path.empty()) {
-            // if preset.ini is used, do not set other paths
-            result.preset_path = hf_cache::finalize_file(hf.preset);
-        } else {
-            for (const auto & f : hf.model_files) {
-                hf_cache::finalize_file(f);
-            }
-            result.model_path = hf.primary.final_path;
+std::vector<std::string> common_download_get_all_parts(const std::string & url) { // expands a multi-part GGUF url into the urls of all parts; otherwise returns {url}
+    auto split = get_gguf_split_info(url); // yields the part count and the url prefix used below

-            if (!hf.mmproj.path.empty()) {
-                result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
-            }
-
-            if (!hf.mtp.path.empty()) {
-                result.mtp_path = hf_cache::finalize_file(hf.mtp);
-            }
-        }
-    } else {
-        result.model_path = model.path;
+    if (split.count <= 1) { // not split into several parts
+        return {url}; // the input url is returned unchanged as the only entry
    }

-    return result;
+    std::vector<std::string> parts;
+    for (int i = 1; i <= split.count; i++) { // part numbers are 1-based
+        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count); // e.g. "-00001-of-00003.gguf"
+        parts.push_back(split.prefix + suffix);
+    }
+    return parts;
 }

 //
--- a/common/download.h
+++ b/common/download.h
@ -1,7 +1,10 @@
 #pragma once

+#include "hf-cache.h"
+
 #include <string>
 #include <vector>
+#include <functional>

 struct common_params_model;

@ -47,66 +50,40 @@ struct common_cached_model_info {
    }
 };

-// Options for common_download_model and common_download_file_single
+// Options for common_download_file_single and common_download_get_hf_plan (download_mmproj / download_mtp)
 struct common_download_opts {
    std::string bearer_token;
    common_header_list headers;
    bool offline = false;
-    bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
    bool download_mmproj = false;
    bool download_mtp = false;
    common_download_callback * callback = nullptr;
 };

-// Result of common_download_model
-struct common_download_model_result {
-    std::string model_path;
-    std::string mmproj_path;
-    std::string mtp_path;
-    std::string preset_path;
+struct common_download_task { // one file to download: where from, where to, and with which options
+    common_download_opts opts; // per-task options handed to common_download_file_single
+    std::string url; // remote location of the file
+    std::string local_path; // destination path passed to common_download_file_single
+    std::function<void()> on_done; // optional hook, may be empty; NOTE(review): common_download_run_tasks does not call it - confirm who does
+    bool is_hf = false; // set for files that come from the HF cache; passed as the last argument of common_download_file_single
+
+    common_download_task() = default;
+    common_download_task(hf_cache::hf_file f, // builds a task from an HF cache entry (taken by value)
+            const common_download_opts & opts,
+            std::function<void()> on_done = nullptr)
+        : opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {} // copies url/local_path out of f and marks the task as HF
 };

-// throw if the file is missing or invalid (e.g. ETag check failed)
-struct common_skip_download_exception : public std::runtime_error {
-    common_skip_download_exception() : std::runtime_error("skip download") {}
-};
+void common_download_run_tasks(const std::vector<common_download_task> & tasks);

-// Download model from HuggingFace repo or URL
-//
-// input (via model struct):
-// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
-// - model.hf_file: specific file in the repo (requires hf_repo)
-// - model.url: simple download (used if hf_repo is empty)
-// - model.path: local file path
-//
-// tag matching (for HF repos without model.hf_file):
-// - if tag is specified, searches for GGUF matching that quantization
-// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
-//
-// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
-// detected and all parts are downloaded
-//
-// caching:
-// - HF repos: uses HuggingFace cache
-// - URLs: uses ETag-based caching
-//
-// when opts.offline=true, no network requests are made
-// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
-// then with the closest quantization bits
-// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
-//
-// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
-common_download_model_result common_download_model(
-    const common_params_model & model,
-    const common_download_opts & opts = {}
-);
+// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file
+std::vector<std::string> common_download_get_all_parts(const std::string & url);

 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();

 // download single file from url to local path
 // returns status code or -1 on error
-// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true)
 // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                const std::string & path,
@ -123,3 +100,12 @@ std::string common_docker_resolve_model(const std::string & docker);
 // - if tag is present, removes only files matching that tag (and orphaned blobs)
 // returns true if anything was removed
 bool common_download_remove(const std::string & hf_repo_with_tag);
+
+struct common_download_hf_plan { // files selected from a HuggingFace repo by common_download_get_hf_plan
+    hf_cache::hf_file primary; // the selected model file
+    hf_cache::hf_files model_files; // files to fetch for the model itself, as returned by get_split_files for primary
+    hf_cache::hf_file mmproj; // only looked up when opts.download_mmproj is set
+    hf_cache::hf_file mtp; // only looked up when opts.download_mtp is set
+    hf_cache::hf_file preset; // if set, only this file is downloaded
+};
+common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts); // builds the plan from model.hf_repo (repo plus optional tag)
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@ -686,59 +686,62 @@ value set_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

+static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) { // binds the declared parameters (this_args) of callable 'name' to the supplied args, storing them in ctx
+    const size_t expected_count = this_args.size(); // number of declared parameters
+    const size_t input_count = args.count(); // number of supplied arguments
+
+    JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
+    for (size_t i = 0; i < expected_count; ++i) { // iterates over declared parameters only: surplus inputs are never examined
+        if (i < input_count) { // an input exists at this index
+            if (is_stmt<identifier>(this_args[i])) {
+                // normal parameter
+                std::string param_name = cast_stmt<identifier>(this_args[i])->val;
+                value param_value = args.get_kwarg_or_pos(param_name, i);
+                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                ctx.set_val(param_name, param_value);
+            } else if (is_stmt<keyword_argument_expression>(this_args[i])) {
+                // default argument used as normal parameter
+                auto kwarg = cast_stmt<keyword_argument_expression>(this_args[i]);
+                if (!is_stmt<identifier>(kwarg->key)) {
+                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
+                }
+                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                value param_value = args.get_kwarg_or_pos(param_name, i); // the supplied value is used; the declared default is not evaluated
+                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                ctx.set_val(param_name, param_value);
+            } else {
+                throw std::runtime_error("Invalid parameter type in '" + name + "'");
+            }
+        } else { // no input at this index: the parameter must declare a default
+            auto & default_arg = this_args[i];
+            if (is_stmt<keyword_argument_expression>(default_arg)) {
+                auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
+                if (!is_stmt<identifier>(kwarg->key)) {
+                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
+                }
+                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
+                ctx.set_val(param_name, kwarg->val->execute(args.ctx)); // the default expression is evaluated in args.ctx, not in the scope being populated
+            } else {
+                throw std::runtime_error("Not enough arguments provided to '" + name + "'"); // plain parameter without a default and without an input
+            }
+            //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
+            //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
+            //ctx.var[param_name] = default_args[i]->execute(ctx);
+        }
+    }
+}
+
 value macro_statement::execute_impl(context & ctx) {
    if (!is_stmt<identifier>(this->name)) {
        throw std::runtime_error("Macro name must be an identifier");
    }
    std::string name = cast_stmt<identifier>(this->name)->val;

-    const func_handler func = [this, name, &ctx](const func_args & args) -> value {
-        size_t expected_count = this->args.size();
-        size_t input_count = args.count();
+    const func_handler func = [this, name](const func_args & args) -> value {
+        context macro_ctx(args.ctx); // new scope for macro execution

-        JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
-        context macro_ctx(ctx); // new scope for macro execution
-
-        // bind parameters
-        for (size_t i = 0; i < expected_count; ++i) {
-            if (i < input_count) {
-                if (is_stmt<identifier>(this->args[i])) {
-                    // normal parameter
-                    std::string param_name = cast_stmt<identifier>(this->args[i])->val;
-                    value param_value = args.get_kwarg_or_pos(param_name, i);
-                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                    macro_ctx.set_val(param_name, param_value);
-                } else if (is_stmt<keyword_argument_expression>(this->args[i])) {
-                    // default argument used as normal parameter
-                    auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
-                    if (!is_stmt<identifier>(kwarg->key)) {
-                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
-                    }
-                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                    value param_value = args.get_kwarg_or_pos(param_name, i);
-                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                    macro_ctx.set_val(param_name, param_value);
-                } else {
-                    throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
-                }
-            } else {
-                auto & default_arg = this->args[i];
-                if (is_stmt<keyword_argument_expression>(default_arg)) {
-                    auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
-                    if (!is_stmt<identifier>(kwarg->key)) {
-                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
-                    }
-                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                    JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
-                    macro_ctx.set_val(param_name, kwarg->val->execute(ctx));
-                } else {
-                    throw std::runtime_error("Not enough arguments provided to macro '" + name + "'");
-                }
-                //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
-                //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
-                //macro_ctx.var[param_name] = default_args[i]->execute(ctx);
-            }
-        }
+        bind_parameters(name, this->args, args, macro_ctx);

        // execute macro body
        JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size());
@ -752,6 +755,46 @@ value macro_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

+value call_statement::execute_impl(context & ctx) { // runs a call statement: invokes the callee with a function named 'caller' that executes this statement's body
+    auto call_expr = cast_stmt<call_expression>(this->call);
+    if (!call_expr) { // NOTE(review): assumes cast_stmt yields null on a type mismatch - confirm
+        throw std::runtime_error("Call statement requires a valid call expression");
+    }
+
+    value callee_val = call_expr->callee->execute(ctx); // the callee expression is evaluated in the current scope
+    if (!is_val<value_func>(callee_val)) {
+        throw std::runtime_error("Callee is not a function: got " + callee_val->type());
+    }
+    auto * callee_func = cast_val<value_func>(callee_val);
+
+    context caller_ctx(ctx); // new scope for caller execution
+
+    const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value { // handler exposed to the callee as 'caller'; it owns caller_ctx
+        context block_ctx(caller_ctx); // new scope for block execution
+
+        bind_parameters("caller", this->caller_args, args, block_ctx); // caller_args are bound with the same helper used for macro parameters
+
+        JJ_DEBUG("Executing call body with %zu statements", this->body.size());
+        auto res = exec_statements(this->body, block_ctx);
+        JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str());
+        return res;
+    };
+
+    context call_ctx(ctx); // scope handed to the callee through func_args
+    call_ctx.set_val("caller", mk_val<value_func>("caller", func)); // makes 'caller' resolvable in that scope
+
+    func_args args(call_ctx);
+
+    for (const auto & arg_expr : call_expr->args) {
+        auto arg_val = arg_expr->execute(ctx); // arguments are evaluated in the enclosing ctx, not in call_ctx
+        JJ_DEBUG("  Argument type: %s", arg_val->type().c_str());
+        args.push_back(arg_val);
+    }
+
+    JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count());
+    return callee_func->invoke(args); // the statement's value is whatever the callee returns
+}
+
 value member_expression::execute_impl(context & ctx) {
    value object = this->object->execute(ctx);

--- a/common/jinja/runtime.h
+++ b/common/jinja/runtime.h
@ -552,6 +552,7 @@ struct call_statement : public statement {
        for (const auto & arg : this->caller_args) chk_type<expression>(arg);
    }
    std::string type() const override { return "CallStatement"; }
+    value execute_impl(context & ctx) override;
 };

 struct ternary_expression : public expression {
--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@ -1,324 +0,0 @@
-#include "json-partial.h"
-
-#include "log.h"
-
-#include <nlohmann/json.hpp>
-
-#include <string>
-#include <regex>
-
-using json = nlohmann::ordered_json;
-
-enum common_json_stack_element_type {
-    COMMON_JSON_STACK_ELEMENT_OBJECT,
-    COMMON_JSON_STACK_ELEMENT_KEY,
-    COMMON_JSON_STACK_ELEMENT_ARRAY,
-};
-
-struct common_json_stack_element {
-    common_json_stack_element_type type;
-    std::string key;
-};
-
-bool common_json_parse(
-    const std::string & input,
-    const std::string & healing_marker,
-    common_json & out)
-{
-    std::string::const_iterator it = input.begin();
-    const auto end = input.end();
-    return common_json_parse(it, end, healing_marker, out);
-}
-
-bool common_json_parse(
-    std::string::const_iterator & it,
-    const std::string::const_iterator & end,
-    const std::string & healing_marker,
-    common_json & out)
-{
-    // // https://json.nlohmann.me/features/parsing/sax_interface/
-    struct json_error_locator : public nlohmann::json_sax<json> {
-        std::size_t position;
-        bool found_error;
-        std::string last_token;
-        std::string exception_message;
-        std::vector<common_json_stack_element> stack;
-
-        json_error_locator() : position(0), found_error(false) {}
-
-        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
-            this->position = position - 1;
-            this->found_error = true;
-            this->last_token = last_token;
-            this->exception_message = ex.what();
-            return false;
-        }
-        void close_value() {
-            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
-                stack.pop_back();
-            }
-        }
-        bool null() override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool boolean(bool) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_integer(number_integer_t) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_unsigned(number_unsigned_t) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_float(number_float_t, const string_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool string(string_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool binary(binary_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool start_object(std::size_t) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
-            return true;
-        }
-        bool end_object() override {
-            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
-            stack.pop_back();
-            close_value();
-            return true;
-        }
-        bool key(string_t & key) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
-            return true;
-        }
-        bool start_array(std::size_t) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
-            return true;
-        }
-        bool end_array() override {
-            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
-            stack.pop_back();
-            close_value();
-            return true;
-        }
-    };
-    json_error_locator err_loc;
-    auto start = it;
-    json::sax_parse(it, end, &err_loc);
-
-    if (err_loc.found_error) {
-        it = start;
-        auto temptative_end = it + err_loc.position;
-        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
-
-        auto input = std::string(it, temptative_end);
-        try {
-            out.json = json::parse(input);
-            // out.json = json::parse(it, temptative_end);
-            it = temptative_end;
-            return true;
-        } catch (const std::exception & ex) {
-            // No, needs healing.
-            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
-        }
-        auto can_parse = [](const std::string & str) {
-            try {
-                auto _ = json::parse(str); // NOLINT
-                return true;
-            } catch (const std::exception &) {
-                return false;
-            }
-        };
-        if (!healing_marker.empty() && !err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
-            if (last_non_sp_pos == std::string::npos) {
-                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
-            }
-            auto last_non_sp_char = str[last_non_sp_pos];
-            // Used to detect stops on a number, which may not be complete.
-            auto was_maybe_number = [&]() {
-                if (!str.empty() && std::isspace(str.back())) {
-                    return false;
-                }
-                return std::isdigit(last_non_sp_char) ||
-                    last_non_sp_char == '.' ||
-                    last_non_sp_char == 'e' ||
-                    last_non_sp_char == 'E' ||
-                    last_non_sp_char == '-';
-            };
-
-            std::string closing;
-            for (size_t i = err_loc.stack.size(); i > 0; i--) {
-                auto & el = err_loc.stack[i - 1];
-                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
-                    closing += "}";
-                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
-                    closing += "]";
-                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
-                    throw std::runtime_error("Unexpected stack element type");
-                }
-            }
-
-            // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
-            static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
-
-            auto is_high_surrogate = [&](const std::string & s) {
-                // Check if a partial of a high surrogate (U+D800-U+DBFF)
-                return s.length() >= 4 &&
-                    s[0] == '\\' && s[1] == 'u' &&
-                    std::tolower(s[2]) == 'd' &&
-                    (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
-            };
-
-            // Initialize the unicode marker to a low surrogate to handle the edge case
-            // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
-            // backslash (\)
-            std::string unicode_marker_padding = "udc00";
-            std::smatch last_unicode_seq;
-
-            if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
-                std::smatch second_last_seq;
-                std::string prelude = str.substr(0, last_unicode_seq.position());
-
-                // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
-                unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
-
-                if (is_high_surrogate(last_unicode_seq.str())) {
-                    // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
-                    unicode_marker_padding += "\\udc00";
-                } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
-                    if (is_high_surrogate(second_last_seq.str())) {
-                        // If this follows a high surrogate, pad it to be a low surrogate
-                        if (last_unicode_seq.length() == 2) {
-                            unicode_marker_padding = "dc00";
-                        } else if (last_unicode_seq.length() == 3) {
-                            unicode_marker_padding = "c00";
-                        } else {
-                            // The original unicode_marker_padding is already padded with 0s
-                        }
-                    }
-                }
-            }
-
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
-
-            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
-                // We're inside an object value
-                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
-                    // Was about to create an object value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + ": 1" + closing)) {
-                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
-                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
-                    // Was about to create an object
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + "\"" + closing)) {
-                    // Was inside an object value string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
-                    // Was inside an object value string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
-                    // Was inside an object value string after a partial unicode escape
-                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
-                } else {
-                    // find last :
-                    auto last_pos = str.find_last_of(':');
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
-                    }
-                    // Cutting back to opening : for object value
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
-                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
-                    // Was about to create an array value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + "\"" + closing)) {
-                    // Was inside an array value string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
-                    // Was inside an array value string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
-                    // Was inside an array value string after a partial unicode escape
-                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
-                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
-                    // Had just finished a value
-                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
-                } else {
-                    auto last_pos = str.find_last_of("[,");
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
-                    }
-                    // Cutting back to last [ or , for array value
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
-                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
-                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
-                    // Was about to create an object key+value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
-                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
-                    // Was about to create an object key+value
-                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + "\": 1" + closing)) {
-                    // Was inside an object key string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
-                    // Was inside an object key string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
-                    // Was inside an object key string after a partial unicode escape
-                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
-                } else {
-                    auto last_pos = str.find_last_of(':');
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
-                    }
-                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else {
-                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
-            }
-            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
-        // handle unclosed top-level primitive
-        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;
-            if (can_parse(str + "\"")) {
-                // Was inside an string
-                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
-            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
-                // Was inside an string after an escape
-                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
-            } else {
-                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-                // fprintf(stderr, "Closing: TODO\n");
-                return false;
-            }
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
-        return false;
-    }
-    out.json = json::parse(it, end);
-    it = end;
-    return true;
-}
--- a/common/json-partial.h
+++ b/common/json-partial.h
@ -1,39 +0,0 @@
-#pragma once
-
-// TODO: use json_fwd.hpp when possible
-#include <nlohmann/json.hpp>
-
-// Healing marker (empty if the JSON was fully parsed / wasn't healed).
-struct common_healing_marker {
-    // Raw marker.
-    std::string marker;
-
-    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
-    std::string json_dump_marker;
-};
-
-// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
-struct common_json {
-    nlohmann::ordered_json json;
-
-    common_healing_marker healing_marker;
-};
-
-// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
-//
-// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
-// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
-// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
-//
-// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
-bool common_json_parse(
-    const std::string & input,
-    const std::string & healing_marker,
-    common_json & out);
-
-// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
-bool common_json_parse(
-    std::string::const_iterator & it,
-    const std::string::const_iterator & end,
-    const std::string & healing_marker,
-    common_json & out);
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -233,27 +233,27 @@ struct BuiltinRule {
 };

 static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
-    {"boolean", {"(\"true\" | \"false\") space", {}}},
+    {"boolean", {"(\"true\" | \"false\")", {}}},
    {"decimal-part", {"[0-9]{1,16}", {}}},
    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
-    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
-    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part)", {"integral-part"}}},
    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
-    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
-    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}},
    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
-    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
-    {"null", {"\"null\" space", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}},
+    {"null", {"\"null\"", {}}},
 };

 static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
-    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
-    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
-    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
+    {"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}}
 };

 static bool is_reserved_name(const std::string & name) {
@ -551,16 +551,16 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"");
    }

    /*
        Returns a rule that matches a JSON string that is none of the provided strings

        not_strings({"a"})
-            -> ["] ( [a] char+ | [^"a] char* )? ["] space
+            -> ["] ( [a] char+ | [^"a] char* )? ["]
        not_strings({"and", "also"})
-            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
    */
    std::string _not_strings(const std::vector<std::string> & strings) {

@ -619,7 +619,7 @@ private:
        if (!trie.is_end_of_string) {
            out << "?";
        }
-        out << " [\"] space";
+        out << " [\"]";
        return out.str();
    }

@ -725,7 +725,7 @@ private:
            rule += " )?";
        }

-        rule += " \"}\" space";
+        rule += " space \"}\"";

        return rule;
    }
@ -858,14 +858,14 @@ public:
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
        }
        if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
        }
        if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")");
        }
        if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
@ -933,7 +933,7 @@ public:
                    }
                }
                if (!enum_intersection.empty()) {
-                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")");
                }
            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
@ -948,7 +948,7 @@ public:
                    }
                    rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
                }
-                rule += " \"]\" space";
+                rule += " space \"]\"";
                return _add_rule(rule_name, rule);
            }
            std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
@ -956,7 +956,7 @@ public:
            json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();

-            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
+            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\"");
        }
        if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
@ -972,7 +972,7 @@ public:
            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
-            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\"");
        }
        if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int64_t min_value = std::numeric_limits<int64_t>::min();
@ -990,7 +990,7 @@ public:
            std::stringstream out;
            out << "(";
            build_min_max_int(min_value, max_value, out);
-            out << ") space";
+            out << ")";
            return _add_rule(rule_name, out.str());
        }
        if (schema.empty() || schema_type == "object") {
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
@ -6,13 +6,14 @@
 #include "unicode.h"

 #include <algorithm>
+#include <deque>
 #include <initializer_list>
 #include <map>
 #include <memory>
 #include <nlohmann/json.hpp>
 #include <regex>
+#include <set>
 #include <stdexcept>
-#include <unordered_set>

 // Trick to catch missing branches
 template <typename T>
@ -88,40 +89,7 @@ struct trie {
        return match_result{match_result::NO_MATCH};
    }

-    struct prefix_and_next {
-        std::vector<uint32_t> prefix;
-        std::vector<uint32_t> next_chars;
-    };
-
-    std::vector<prefix_and_next> collect_prefix_and_next() {
-        std::vector<uint32_t>        prefix;
-        std::vector<prefix_and_next> result;
-        collect_prefix_and_next(0, prefix, result);
-        return result;
-    }
-
  private:
-    void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
-        if (!nodes[index].is_word) {
-            if (!nodes[index].children.empty()) {
-                std::vector<uint32_t> chars;
-                chars.reserve(nodes[index].children.size());
-                for (const auto & p : nodes[index].children) {
-                    chars.push_back(p.first);
-                }
-                out.emplace_back(prefix_and_next{prefix, chars});
-            }
-        }
-
-        for (const auto & p : nodes[index].children) {
-            uint32_t ch = p.first;
-            auto child = p.second;
-            prefix.push_back(ch);
-            collect_prefix_and_next(child, prefix, out);
-            prefix.pop_back();
-        }
-    }
-
    size_t create_node() {
        size_t index = nodes.size();
        nodes.emplace_back();
@ -153,6 +121,65 @@ struct trie {
    }
 };

+// Aho-Corasick automaton: trie of the input strings plus failure links and per-state match flags
+struct aho_corasick {
+    trie                t;         // trie of the input strings; its nodes are the automaton states
+    std::vector<size_t> fail;      // failure links (fallback state when a transition is missing; 0 = root)
+    std::vector<size_t> order;     // states in BFS order
+    std::vector<bool>   terminal;  // match states (directly or via a suffix link)
+    std::set<uint32_t>  alphabet;  // every character with a transition
+
+    aho_corasick(const std::vector<std::string> & strings) : t(strings) {  // fills fail, order, terminal and alphabet
+        const auto & nodes = t.nodes;
+        const size_t n = nodes.size();
+
+        fail.assign(n, 0);
+        order.reserve(n);
+
+        std::deque<size_t> queue{ 0 };  // breadth-first traversal starting at the root (state 0)
+        while (!queue.empty()) {
+            size_t u = queue.front();
+            queue.pop_front();
+            order.push_back(u);
+            for (const auto & [ch, v] : nodes[u].children) {
+                if (u != 0) {  // children of the root keep the default fail == 0
+                    size_t f = fail[u];
+                    while (f && nodes[f].children.find(ch) == nodes[f].children.end()) {  // climb u's failure chain until a `ch` edge exists or the root is reached
+                        f = fail[f];
+                    }
+                    auto it = nodes[f].children.find(ch);
+                    fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0;  // no usable edge -> fall back to the root
+                }
+                queue.push_back(v);
+            }
+        }
+
+        terminal.assign(n, false);
+        for (size_t u : order) {  // fail[u] is a shallower node, so BFS order has already settled terminal[fail[u]]
+            terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]);
+        }
+
+        for (const auto & node : nodes) {  // collect every edge label used anywhere in the trie
+            for (const auto & [ch, v] : node.children) {
+                alphabet.insert(ch);
+            }
+        }
+    }
+
+    size_t num_states()          const { return t.nodes.size(); }
+    bool   is_terminal(size_t s) const { return terminal[s]; }
+
+    // follow failure links until a transition on `ch` exists; returns 0 (the root) if none does.
+    size_t next(size_t state, uint32_t ch) const {
+        const auto & nodes = t.nodes;
+        while (state && nodes[state].children.find(ch) == nodes[state].children.end()) {
+            state = fail[state];
+        }
+        auto it = nodes[state].children.find(ch);
+        return it != nodes[state].children.end() ? it->second : 0;
+    }
+};
+
 static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
    if (pos + hex_count > str.length()) {
        return {0, 0};
@ -894,6 +921,10 @@ struct parser_executor {
    common_peg_parse_result operator()(const common_peg_gbnf_parser & p) {
        return arena.parse(p.child, ctx, start_pos);
    }
+
+    common_peg_parse_result operator()(const common_peg_ac_parser & p) {
+        return arena.parse(p.child, ctx, start_pos);  // parsing defers entirely to the child; p.delimiters is not consulted here
+    }
 };

 common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
@ -962,7 +993,8 @@ void common_peg_arena::resolve_refs() {
                                 std::is_same_v<T, common_peg_not_parser> ||
                                 std::is_same_v<T, common_peg_tag_parser> ||
                                 std::is_same_v<T, common_peg_atomic_parser> ||
-                                 std::is_same_v<T, common_peg_gbnf_parser>) {
+                                 std::is_same_v<T, common_peg_gbnf_parser> ||
+                                 std::is_same_v<T, common_peg_ac_parser>) {
                p.child = resolve_ref(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
                p.child = resolve_ref(p.child);
@ -992,12 +1024,12 @@ void common_peg_arena::resolve_refs() {
 }

 std::string common_peg_arena::dump(common_peg_parser_id id) const {
-    std::unordered_set<common_peg_parser_id> visited;
+    std::set<common_peg_parser_id> visited;
    return dump_impl(id, visited);
 }

 std::string common_peg_arena::dump_impl(common_peg_parser_id                       id,
-                                        std::unordered_set<common_peg_parser_id> & visited) const {
+                                        std::set<common_peg_parser_id> & visited) const {
    // Check for cycles
    if (visited.count(id)) {
        return "[cycle]";
@ -1043,6 +1075,8 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
            return "Atomic(" + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
            return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")";
+        } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
+            return "Ac(" + string_join(p.delimiters, " | ") + ", " + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
            return "Any";
        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@ -1342,7 +1376,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
 common_peg_parser common_peg_parser_builder::json_array() {
    return rule("json-array", [this]() {
        auto ws = space();
-        auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
+        auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))});
        return sequence({
            literal("["),
            ws,
@ -1452,6 +1486,13 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key
    });
 }

+common_peg_parser common_peg_parser_builder::ac(const common_peg_parser & p, const std::vector<std::string> & delimiters) {
+    if (delimiters.empty()) {  // an ac node without delimiters is rejected up front
+        throw std::runtime_error("ac parser requires at least one delimiter");
+    }
+    return add(common_peg_ac_parser{p, delimiters});  // register a node wrapping p together with its delimiter list
+}
+
 static std::string gbnf_escape_char_class(uint32_t c) {
    if (c == '-' || c == ']' || c == '[' || c == '\\') {
        return "\\" + std::string(1, (char) c);
@ -1502,61 +1543,118 @@ static std::string gbnf_escape_char_class(uint32_t c) {
    return std::string(buf);
 }

-static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
-    trie matcher(strings);
-    auto pieces = matcher.collect_prefix_and_next();
-
-    std::string pattern;
-    std::string trailing;  // optional proper-prefix of a delimiter, allowed only at the very end
-    for (size_t i = 0; i < pieces.size(); ++i) {
-        if (i > 0) {
-            pattern += " | ";
-        }
-
-        const auto & pre = pieces[i].prefix;
-        const auto & chars = pieces[i].next_chars;
-
-        std::string cls;
-        cls.reserve(chars.size());
-        for (uint32_t ch : chars) {
-            cls += gbnf_escape_char_class(ch);
-        }
-
-        if (!pre.empty()) {
-            std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
-            pattern += pre_literal + " [^" + cls + "]";
-            // Each interior alternative consumes a delimiter-prefix plus a disambiguating
-            // char, so the repetition alone cannot match a value that *ends* on a proper
-            // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
-            // "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
-            // values, so without this the grammar would reject input the parser accepts.
-            // Allow the value to terminate on any proper prefix as an optional tail.
-            // This makes the grammar a slight superset of the runtime language (a value
-            // may end on the longest prefix, which greedy first-match would not itself
-            // produce); harmless for constrained generation, which only needs to admit
-            // every runtime-valid string.
-            if (!trailing.empty()) {
-                trailing += " | ";
-            }
-            trailing += pre_literal;
-        } else {
-            pattern += "[^" + cls + "]";
-        }
+static std::string gbnf_char_class(const std::vector<uint32_t> & chars, bool negate) {  // builds "[...]" or, when negate is set, "[^...]"
+    std::string s = negate ? "[^" : "[";
+    for (uint32_t ch : chars) {
+        s += gbnf_escape_char_class(ch);  // each char is escaped for use inside a character class
     }
-
-    std::string result = "(" + pattern + ")*";
-    if (!trailing.empty()) {
-        result += " (" + trailing + ")?";
-    }
-    return result;
+    return s + "]";
 }

-static std::unordered_set<std::string> collect_reachable_rules(
+static std::string gbnf_ac_grammar(  // adds one rule per non-match automaton state to `builder`; returns the start-state rule name
+    const common_grammar_builder &   builder,
+    const std::string &              prefix,  // name of the start-state rule and stem of the other rule names
+    const std::vector<std::string> & strings,
+    const std::function<std::string(const std::vector<uint32_t> &,  // build_rule(completing, buckets, specific, state_name) -> rule body
+                                    const std::map<size_t, std::vector<uint32_t>> &,
+                                    const std::vector<uint32_t> &,
+                                    const std::function<std::string(size_t)> &)> & build_rule) {
+    aho_corasick ac(strings);
+
+    auto state_name = [&](size_t s) -> std::string {  // rule name for a state: `prefix` for state 0, otherwise `prefix-NN`
+        if (s == 0) {
+            return prefix;
+        }
+        std::string num = std::to_string(s);
+        num = num.size() == 1 ? ("0" + num) : num;  // single digits are zero-padded to two characters
+        return prefix + "-" + num;
+    };
+
+    for (size_t q = 0; q < ac.num_states(); q++) {
+        if (ac.is_terminal(q)) {
+            continue; // match states get no rule of their own
+        }
+
+        std::map<size_t, std::vector<uint32_t>> buckets;  // non-match, non-root destination state -> chars leading to it
+        std::vector<uint32_t> completing;  // chars that complete a delimiter
+        std::vector<uint32_t> specific;    // chars with an explicit transition
+        for (uint32_t c : ac.alphabet) {
+            size_t d = ac.next(q, c);
+            if (ac.is_terminal(d)) {
+                completing.push_back(c);
+                specific.push_back(c);
+            } else if (d != 0) {
+                buckets[d].push_back(c); // specific non-root destination
+                specific.push_back(c);
+            }
+        }
+
+        builder.add_rule(state_name(q), build_rule(completing, buckets, specific, state_name));
+    }
+
+    // An empty delimiter makes the start state terminal. Emit an entry rule
+    // that matches the empty string so the returned reference stays valid.
+    if (ac.is_terminal(0)) {
+        builder.add_rule(prefix, "|");
+    }
+
+    return state_name(0);
+}
+
+// GBNF grammar matching strings that contain no string in `strings` as a
+// substring. Emits the complement of an Aho-Corasick automaton DFA and returns
+// the start state rule name.
+//
+// ref: https://github.com/ggml-org/llama.cpp/pull/24839
+static std::string gbnf_excluding_grammar(const common_grammar_builder & builder,
+                                          const std::string &            prefix,
+                                          const std::vector<std::string> & strings) {
+    return gbnf_ac_grammar(builder, prefix, strings,
+        [](const std::vector<uint32_t> & /*completing*/,
+           const std::map<size_t, std::vector<uint32_t>> & buckets,
+           const std::vector<uint32_t> & specific,
+           const std::function<std::string(size_t)> & state_name) {
+            // every state is accepting and completing chars get no
+            // alternative, so a forbidden string can never be matched
+            std::string rhs = "|";  // leading empty alternative: the rule may stop at this state
+            for (const auto & [d, chars] : buckets) {
+                rhs += " " + gbnf_char_class(chars, false) + " " + state_name(d) + " |";
+            }
+            rhs += " " + gbnf_char_class(specific, true) + " " + state_name(0);  // any char without an explicit transition restarts at the start state
+            return rhs;
+        });
+}
+
+// GBNF grammar matching everything up to and including the first occurrence of
+// any string in `strings`. Emits the Aho-Corasick automaton DFA and returns
+// the start state rule name.
+static std::string gbnf_including_grammar(const common_grammar_builder & builder,
+                                          const std::string &            prefix,
+                                          const std::vector<std::string> & strings) {
+    return gbnf_ac_grammar(builder, prefix, strings,
+        [](const std::vector<uint32_t> & completing,
+           const std::map<size_t, std::vector<uint32_t>> & buckets,
+           const std::vector<uint32_t> & specific,
+           const std::function<std::string(size_t)> & state_name) {
+            std::vector<std::string> alts;
+            if (!completing.empty()) {
+                alts.push_back(gbnf_char_class(completing, false)); // terminate on match: no state reference follows the class
+            }
+            for (const auto & [d, chars] : buckets) {
+                alts.push_back(gbnf_char_class(chars, false) + " " + state_name(d));  // char moves to the specific non-root state d
+            }
+            // every other character keeps scanning from the start state
+            alts.push_back(gbnf_char_class(specific, true) + " " + state_name(0));
+            return string_join(alts, " | ");
+        });
+}
+
+static std::set<std::string> collect_reachable_rules(
    const common_peg_arena & arena,
    const common_peg_parser_id & rule
 ) {
-    std::unordered_set<std::string> reachable;
-    std::unordered_set<std::string> visited;
+    std::set<std::string> reachable;
+    std::set<std::string> visited;

    std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
        const auto & parser = arena.get(id);
@ -1588,6 +1686,7 @@ static std::unordered_set<std::string> collect_reachable_rules(
                                 std::is_same_v<T, common_peg_tag_parser> ||
                                 std::is_same_v<T, common_peg_atomic_parser> ||
                                 std::is_same_v<T, common_peg_gbnf_parser> ||
+                                 std::is_same_v<T, common_peg_ac_parser> ||
                                 std::is_same_v<T, common_peg_schema_parser>) {
                visit(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
@ -1765,7 +1864,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                if (p.delimiters.empty()) {
                    return ".*";
                }
-                return gbnf_excluding_pattern(p.delimiters);
+                return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters);
            } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
                if (schema_delegates(p)) {
                    return to_gbnf(p.child);
@ -1782,6 +1881,8 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                return to_gbnf(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
                return p.grammar;
+            } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
+                return gbnf_including_grammar(builder, "ac-" + std::to_string(id), p.delimiters);
            } else {
                static_assert(is_always_false_v<T>);
            }
@ -1789,7 +1890,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
    };

    // Collect reachable rules
-    std::unordered_set<std::string> reachable_rules;
+    std::set<std::string> reachable_rules;

    if (lazy) {
        // Collect rules reachable from trigger rules
@ -1918,6 +2019,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
            };
        } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
            return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}};
+        } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
+            return json{{"type", "ac"}, {"child", p.child}, {"delimiters", p.delimiters}};
        }
    }, variant);
 }
@ -2090,6 +2193,16 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
        };
    }

+    if (type == "ac") {
+        if (!j.contains("child") || !j.contains("delimiters") || !j["delimiters"].is_array() || j["delimiters"].empty()) {
+            throw std::runtime_error("ac parser requires 'child' and a non-empty 'delimiters' array");
+        }
+        return common_peg_ac_parser{
+            j["child"].get<common_peg_parser_id>(),
+            j["delimiters"].get<std::vector<std::string>>(),
+        };
+    }
+
    throw std::runtime_error("Unknown parser type: " + type);
 }

--- a/common/peg-parser.h
+++ b/common/peg-parser.h
@ -3,8 +3,8 @@
 #include <nlohmann/json_fwd.hpp>

 #include <memory>
+#include <set>
 #include <unordered_map>
-#include <unordered_set>
 #include <string>
 #include <string_view>
 #include <functional>
@ -275,6 +275,11 @@ struct common_peg_gbnf_parser {
    std::string grammar;
 };

+struct common_peg_ac_parser {
+    common_peg_parser_id child;
+    std::vector<std::string> delimiters;
+};
+
 // Variant holding all parser types
 using common_peg_parser_variant = std::variant<
    common_peg_epsilon_parser,
@ -296,7 +301,8 @@ using common_peg_parser_variant = std::variant<
    common_peg_ref_parser,
    common_peg_atomic_parser,
    common_peg_tag_parser,
-    common_peg_gbnf_parser
+    common_peg_gbnf_parser,
+    common_peg_ac_parser
 >;

 class common_peg_arena {
@ -335,7 +341,7 @@ class common_peg_arena {
    friend class common_peg_parser_builder;

  private:
-    std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;
+    std::string dump_impl(common_peg_parser_id id, std::set<common_peg_parser_id> & visited) const;

    common_peg_parser_id add_parser(common_peg_parser_variant parser);
    void add_rule(const std::string & name, common_peg_parser_id id);
@ -514,6 +520,13 @@ class common_peg_parser_builder {
    // the child's grammar. Parsing delegates entirely to the child.
    common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); }

+    // Wraps a child parser but emits a GBNF grammar built from the Aho-Corasick
+    // automaton of `delimiters`, matching everything up to and including the
+    // first delimiter. Parsing delegates entirely to the child, which is
+    // responsible for consuming the delimiter (e.g. until(D) + literal(D)).
+    common_peg_parser ac(const common_peg_parser & p, const std::vector<std::string> & delimiters);
+    common_peg_parser ac(const common_peg_parser & p, const std::string & delimiter) { return ac(p, std::vector<std::string>{delimiter}); }
+
    void set_root(const common_peg_parser & p);

    common_peg_arena build();
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@ -161,6 +161,10 @@ struct common_speculative_impl {

    virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;

+    // (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary).
+    virtual bool get_state(llama_seq_id /*seq_id*/, std::vector<uint8_t> & /*data*/) const { return false; }
+    virtual void set_state(llama_seq_id /*seq_id*/, const std::vector<uint8_t> & /*data*/) {}
+
    // true if this implementation requires the target context to extract post-norm embeddings
    virtual bool need_embd() const = 0;

@ -841,6 +845,49 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                    (size_t) n_embd_dec * sizeof(float));
    }

+    // we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets:
+    // their single-position checkpoints drop it on restore
+    bool need_boundary_stash() const {
+        const llama_model * model_tgt = llama_get_model(params.ctx_tgt);
+        return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
+    }
+
+    // Serializes the deferred boundary of `seq_id` into `data` using the layout
+    // [llama_pos pos][n_embd_dec floats], which is exactly what set_state() accepts.
+    //
+    // Returns false and leaves `data` untouched when:
+    //   - need_boundary_stash() is false,
+    //   - `seq_id` is out of range or pending_pos_last[seq_id] is negative,
+    //   - the stored row is not n_embd_dec floats wide (set_state() would reject such a blob).
+    bool get_state(llama_seq_id seq_id, std::vector<uint8_t> & data) const override {
+        if (!need_boundary_stash()) {
+            return false;
+        }
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) {
+            return false;
+        }
+
+        const llama_pos            pos = pending_pos_last[seq_id];
+        const std::vector<float> & g   = pending_g_last[seq_id];
+
+        // keep the blob size in sync with the check in set_state(): a row of any other
+        // width could never be restored, so do not report it as a valid state
+        if (g.size() != (size_t) n_embd_dec) {
+            return false;
+        }
+
+        const size_t n_g_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        data.resize(sizeof(llama_pos) + n_g_bytes);
+        std::memcpy(data.data(),                     &pos,     sizeof(llama_pos));
+        std::memcpy(data.data() + sizeof(llama_pos), g.data(), n_g_bytes);
+        return true;
+    }
+
+    // Restores the deferred boundary of `seq_id` from a blob laid out as
+    // [llama_pos pos][n_embd_dec floats]. The call is a silent no-op when
+    // need_boundary_stash() is false, `seq_id` is out of range, or `data` does
+    // not have exactly the expected size.
+    void set_state(llama_seq_id seq_id, const std::vector<uint8_t> & data) override {
+        const size_t n_g_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        const bool seq_in_range = seq_id >= 0 && seq_id < (llama_seq_id) n_seq;
+        const bool size_matches = data.size() == sizeof(llama_pos) + n_g_bytes;
+
+        if (!need_boundary_stash() || !seq_in_range || !size_matches) {
+            return;
+        }
+
+        const uint8_t * src = data.data();
+
+        // the position header comes first, the embedding row follows it
+        llama_pos pos = -1;
+        std::memcpy(&pos, src, sizeof(llama_pos));
+        pending_pos_last[seq_id] = pos;
+
+        auto & g = pending_g_last[seq_id];
+        g.resize(n_embd_dec);
+        std::memcpy(g.data(), src + sizeof(llama_pos), n_g_bytes);
+    }
+
    bool need_embd() const override {
        return false;
    }
@ -858,7 +905,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

    int32_t n_embd = 0;

-    bool is_mem_shared = false;
+    // One MTP draft driver, three modes (set once in the ctor):
+    //   is_mem_shared (gemma4): shares the target KV, runs all heads in one graph.
+    //   chain_heads (step35): n_mtp_layers trained heads, one per draft step.
+    //   neither (qwen35 / qwen35moe): a single trained MTP head.
+    int32_t n_mtp_layers  = 1;
+    bool    is_mem_shared = false;   // gemma4
+    bool    chain_heads   = false;   // derived in the ctor: n_mtp_layers > 1 && !is_mem_shared

    // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
    // The last h-row of one process() call needs the first token of the NEXT
@ -873,10 +926,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
    std::vector<std::vector<float>> verify_h;
    std::vector<int32_t> verify_h_rows;

-    // Per-seq draft length from the last draft() call, used in accept() to
-    // roll back ctx_dft's recurrent state past the AR draft's redundant
-    // pre-advancement before process() mirrored the verify batch.
-    std::vector<uint16_t> last_n_drafted;
+    std::vector<int>                i_last;
+    std::vector<std::vector<float>> chain_h;

    common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
@ -889,6 +940,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
        GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
                "MTP input row width must match the target h_nextn width");
+        n_mtp_layers = std::max(1, (int) llama_model_n_layer_nextn(llama_get_model(ctx_dft)));

        LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
@ -935,16 +987,25 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);

        is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt;
+        chain_heads   = n_mtp_layers > 1 && !is_mem_shared;
+
+        if (chain_heads) {
+            this->params.n_max = std::min(this->params.n_max, n_mtp_layers);
+
+            chain_h.assign(n_seq, {});
+            for (auto & c : chain_h) {
+                c.reserve((size_t) (this->params.n_max + 1) * n_embd);
+            }
+        }

        pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));

+        i_last.assign(n_seq, -1);
        i_batch_beg.assign(n_seq, -1);
        i_batch_end.assign(n_seq, -1);

        verify_h.assign(n_seq, {});
        verify_h_rows.assign(n_seq, 0);
-
-        last_n_drafted.assign(n_seq, 0);
    }

    ~common_speculative_impl_draft_mtp() override {
@ -1050,9 +1111,34 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
            }

-            const int32_t rc = llama_decode(ctx_dft, batch);
-            if (rc != 0) {
-                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
+            auto * mem_dft = llama_get_memory(ctx_dft);
+
+            bool ok = true;
+            for (int head = 0; head < n_mtp_layers; ++head) {
+                if (chain_heads) {
+                    // ref: https://github.com/ggml-org/llama.cpp/pull/24340/changes#r3413498544
+                    for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                        if (i_batch_beg[seq_id] < 0) {
+                            continue;
+                        }
+                        llama_memory_seq_rm(mem_dft, seq_id, batch_in.pos[i_batch_beg[seq_id]], -1);
+                    }
+                    llama_set_nextn_layer_offset(ctx_dft, head);
+                }
+
+                const int32_t rc = llama_decode(ctx_dft, batch);
+                if (rc != 0) {
+                    LOG_ERR("%s: llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n",
+                            __func__, head, (int) rc, (int) batch_in.pos[0]);
+                    ok = false;
+                    break;
+                }
+            }
+
+            if (chain_heads) {
+                llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
+            }
+            if (!ok) {
                return false;
            }
        }
@ -1087,7 +1173,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        int n_drafting = 0;
        std::vector<bool> drafting(n_seq);

-        const float * h_row = nullptr;
        const size_t row_bytes = (size_t) n_embd * sizeof(float);

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@ -1102,22 +1187,43 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
            common_sampler_reset(smpls[seq_id].get());

            common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true);
+            std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, pending_h[seq_id].data(), row_bytes);

-            h_row = pending_h[seq_id].data();
-            std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
-        }
+            i_last[seq_id] = batch.n_tokens - 1;

-        int ret = llama_decode(ctx_dft, batch);
-        if (ret != 0) {
-            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
-            return;
+            if (chain_heads) {
+                chain_h[seq_id].assign(pending_h[seq_id].begin(), pending_h[seq_id].end());
+            }
        }

        int i = 0;

        while (n_drafting > 0) {
-            int i_batch = 0;
+            // each step decodes under a different head, i.e. a different decoder layer, and
+            // KV is per layer. process() filled this layer's KV only for positions < n_past
+            // (prompt + accepted prefix) — nothing in the draft region yet. so reset the
+            // draft region (the seq_rm lower bound is n_past, leaving the prompt KV intact)
+            // and select head i so it rebuilds its own layer's KV there; decoding just the
+            // latest token would leave its attention reading cells only another head wrote.
+            if (chain_heads) {
+                auto * mem_dft = llama_get_memory(ctx_dft);
+                for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                    if (drafting[seq_id]) {
+                        llama_memory_seq_rm(mem_dft, seq_id, dparams[seq_id].n_past, -1);
+                    }
+                }
+                llama_set_nextn_layer_offset(ctx_dft, i);
+            }

+            int ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }
+
+            // rebuild the batch for the next step: the growing-KV paths re-add only the
+            // new token (the KV already holds the prefix), while chained heads re-add the
+            // whole prefix at the next head. dropped sequences are simply not re-added.
            common_batch_clear(batch);

            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@ -1127,9 +1233,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

                auto * smpl = smpls[seq_id].get();

-                common_sampler_sample(smpl, ctx_dft, i_batch, true);
-                h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
-                ++i_batch;
+                common_sampler_sample(smpl, ctx_dft, i_last[seq_id], true);
+                const float * h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_last[seq_id]);

                const auto * cur_p = common_sampler_get_candidates(smpl, true);

@ -1163,30 +1268,41 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                    continue;
                }

-                if (is_mem_shared) {
+                if (chain_heads) {
+                    // ref: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448031546
+                    chain_h[seq_id].insert(chain_h[seq_id].end(), h_row, h_row + n_embd);
+
+                    const int n_rows = (int) result.size() + 1; // id_last + tokens drafted so far
+                    for (int t = 0; t < n_rows; ++t) {
+                        const llama_token tok = (t == 0) ? dp.id_last : result[t - 1];
+                        common_batch_add(batch, tok, dp.n_past + t, { seq_id }, t == n_rows - 1);
+                        std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd,
+                                    chain_h[seq_id].data() + (size_t) t * n_embd, row_bytes);
+                    }
+                } else if (is_mem_shared) {
                    // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens
                    // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37
                    common_batch_add(batch, id, dp.n_past, { seq_id }, true);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
                } else {
                    common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
                }
-                std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
+
+                i_last[seq_id] = batch.n_tokens - 1;
            }

            if (batch.n_tokens == 0) {
                break;
            }

-            // evaluate the drafted tokens on the draft model
-            ret = llama_decode(ctx_dft, batch);
-            if (ret != 0) {
-                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
-                break;
-            }
-
            ++i;
        }

+        if (chain_heads) {
+            llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
+        }
+
        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
            auto & dp = dparams[seq_id];
            if (!dp.drafting) {
@ -1196,8 +1312,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
            if (dp.result->size() < (size_t) params.n_min) {
                dp.result->clear();
            }
-
-            last_n_drafted[seq_id] = (uint16_t) dp.result->size();
        }
    }

@ -1810,7 +1924,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,

        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
-        bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
+        bool has_draft_mtp    = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP))    && params.draft.ctx_dft != nullptr;



@ -1848,7 +1962,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        if (has_draft_eagle3) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
        }
-        if (has_mtp) {
+        if (has_draft_mtp) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params));
        }
    }
@ -2118,6 +2232,31 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
    }
 }

+// TODO: support the case of more than one speculative implementation having a state
+// Asks each implementation in turn for the state of `seq_id`; the first one that
+// reports a state wins. Returns false when `spec` is null or no implementation
+// reports a state.
+bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
+    if (spec != nullptr) {
+        for (auto & impl : spec->impls) {
+            const bool has_state = impl->get_state(seq_id, data);
+            if (has_state) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+// Forwards `data` to set_state() of every implementation held by `spec`.
+// Does nothing when `spec` is null.
+void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data) {
+    if (spec != nullptr) {
+        for (auto & impl : spec->impls) {
+            impl->set_state(seq_id, data);
+        }
+    }
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
    if (spec == nullptr) {
        return;
--- a/common/speculative.h
+++ b/common/speculative.h
@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);

+// (optional) get/set internal state
+bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
+void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);

--- a/conversion/init.py
+++ b/conversion/init.py
@ -46,6 +46,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "DbrxForCausalLM": "dbrx",
    "DeciLMForCausalLM": "deci",
    "DeepseekForCausalLM": "deepseek",
+    "DeepseekOCRForCausalLM": "deepseek",
    "DeepseekV2ForCausalLM": "deepseek",
    "DeepseekV3ForCausalLM": "deepseek",
    "DeepseekV32ForCausalLM": "deepseek",
@ -96,6 +97,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "GraniteMoeHybridForCausalLM": "granite",
    "GraniteMoeSharedForCausalLM": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "Grok1ForCausalLM": "grok",
    "GrokForCausalLM": "grok",
    "GroveMoeForCausalLM": "grovemoe",
@ -123,6 +125,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LLaDAModelLM": "llada",
    "LLaMAForCausalLM": "llama",
    "Lfm25AudioTokenizer": "lfm2",
+    "Lfm2BidirectionalModel": "lfm2",
    "Lfm2ForCausalLM": "lfm2",
    "Lfm2Model": "lfm2",
    "Lfm2MoeForCausalLM": "lfm2",
@ -133,6 +136,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LlamaModel": "llama",
    "Eagle3DraftModel": "llama",
    "Eagle3Speculator": "llama",
+    "Eagle3LlamaForCausalLM": "llama",
    "LlamaForCausalLMEagle3": "llama",
    "LlavaForConditionalGeneration": "llama",
    "LlavaStableLMEpochForCausalLM": "stablelm",
@ -231,6 +235,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "UMT5ForConditionalGeneration": "t5",
    "UMT5Model": "t5",
    "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
    "VLlama3ForCausalLM": "llama",
    "VoxtralForConditionalGeneration": "llama",
    "WavTokenizerDec": "wavtokenizer",
@ -261,6 +266,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "GlmasrModel": "ultravox",
    "Granite4VisionForConditionalGeneration": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "HunYuanVLForConditionalGeneration": "hunyuan",
    "Idefics3ForConditionalGeneration": "smolvlm",
    "InternVisionModel": "internvl",
@ -296,6 +302,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "StepVLForConditionalGeneration": "step3",
    "Step3p7ForConditionalGeneration": "step3",
    "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
    "VoxtralForConditionalGeneration": "ultravox",
    "YoutuVLForConditionalGeneration": "youtuvl",
 }
--- a/conversion/bailingmoe.py
+++ b/conversion/bailingmoe.py
@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
--- a/conversion/base.py
+++ b/conversion/base.py
@ -1119,8 +1119,10 @@ class TextModel(ModelBase):

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+        partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
+        original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)

-        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
+        # Ensure global params are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type
+            if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
+                self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+            if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
+                self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings

    @classmethod
    def __init_subclass__(cls):
--- a/conversion/chatglm.py
+++ b/conversion/chatglm.py
@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
            rope_dim = self.hparams["attention_dim"]
        else:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
--- a/conversion/deci.py
+++ b/conversion/deci.py
@ -161,7 +161,7 @@ class DeciModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
 from .qwen import QwenModel


-@ModelBase.register("DeepseekOCRForCausalLM")
+@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@ -205,6 +205,8 @@ class DeepseekModel(TextModel):
@ModelBase.register(
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
+    "UnlimitedOCRForCausalLM",
    "KimiVLForConditionalGeneration",
    "KimiK25ForConditionalGeneration",
    "YoutuForCausalLM",
@ -224,7 +226,7 @@ class DeepseekV2Model(TextModel):
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # special handling for Deepseek OCR
-        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
            self.gguf_writer.add_architecture()
@ -350,6 +352,12 @@ class DeepseekV2Model(TextModel):

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

+        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
+        if is_ocr:
+            sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
+            if sliding_window:
+                self.gguf_writer.add_sliding_window(sliding_window)
+
        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
--- a/conversion/exaone.py
+++ b/conversion/exaone.py
@ -24,7 +24,7 @@ class ExaoneModel(TextModel):

        assert (hparams["activation_function"] == "silu")

-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = self.rope_parameters.get("partial_rotary_factor")
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))

@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
                factor = rope_params.get("factor", 16.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
-        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
--- a/conversion/glm.py
+++ b/conversion/glm.py
@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
            )
        self.gguf_writer.add_rope_dimension_count(
-            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+            int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
        )

        # MoE parameters - Use only routed expert count (shared experts handled separately)
@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
        super().set_gguf_parameters()

        rope_dim = self.hparams["qk_rope_head_dim"]
-        partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))

        # NextN/MTP prediction layers
--- a/conversion/granite.py
+++ b/conversion/granite.py
@ -348,6 +348,34 @@ class GraniteSpeechMmprojModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
+class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
+    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        super().set_gguf_parameters()
+
+        # Add feature_layer if present in encoder config
+        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
+            self.gguf_writer.add_audio_feature_layers(feature_layers)
+            logger.info(f"gguf: audio feature_layers = {feature_layers}")
+
+            # Validate projector dimension matches concatenated encoder output
+            hidden_dim = self.hparams_audio["hidden_dim"]
+            expected_dim = hidden_dim * (len(feature_layers) + 1)
+            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
+
+            if projector_dim != expected_dim:
+                raise ValueError(
+                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
+                    f"expected concatenated dimension ({expected_dim}). "
+                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
+                )
+
+
@ModelBase.register("Granite4VisionForConditionalGeneration")
 class Granite4VisionMmprojModel(MmprojModel):
    has_vision_encoder = True
--- a/conversion/lfm2.py
+++ b/conversion/lfm2.py
@ -64,11 +64,17 @@ class LFM2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Lfm2Model")
+@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
 class LFM2ColBertModel(LFM2Model):
    model_arch = gguf.MODEL_ARCH.LFM2
    dense_tensor_name = "dense_2"

+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hf_arch == "Lfm2BidirectionalModel":
+            self.gguf_writer.add_causal_attention(False)
+        self._try_set_pooling_type()
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if not name.startswith(self.dense_tensor_name):
            name = "model." + name
@ -76,10 +82,11 @@ class LFM2ColBertModel(LFM2Model):
        yield from super().modify_tensors(data_torch, name, bid)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        # dense tensor is stored in a separate safetensors file
+        # optional dense tensor is stored in a separate safetensors file
        from safetensors.torch import load_file
        tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
-        assert tensors_file.is_file()
+        if not tensors_file.is_file():
+            return
        tensor = load_file(tensors_file)["linear.weight"]
        self.gguf_writer.add_embedding_length_out(tensor.shape[0])
        yield f"{self.dense_tensor_name}.weight", tensor.clone()
--- a/conversion/llama.py
+++ b/conversion/llama.py
@ -23,6 +23,7 @@ from .base import ModelBase, TextModel, gguf, logger
    "LlavaForConditionalGeneration",
    "VoxtralForConditionalGeneration",
    "LlamaForCausalLMEagle3",
+    "Eagle3LlamaForCausalLM",
    "Eagle3Speculator",
    "Eagle3DraftModel",
    "IQuestCoderForCausalLM",
@ -289,7 +290,7 @@ class LlamaModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
--- a/conversion/mimo.py
+++ b/conversion/mimo.py
@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])

-        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
+        rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
--- a/conversion/minicpm.py
+++ b/conversion/minicpm.py
@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            rope_dims = self.hparams["qk_rope_head_dim"]

-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

--- a/conversion/nemotron.py
+++ b/conversion/nemotron.py
@ -125,17 +125,18 @@ class NemotronModel(TextModel):
        self.gguf_writer.add_layer_norm_eps(f_norm_eps)

        # * Partial RoPE
-        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)

        # * RopeScaling for Nemotron
-        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+        factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
+        if factor is None:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        else:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+            self.gguf_writer.add_rope_scaling_factor(factor)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
--- a/conversion/phi.py
+++ b/conversion/phi.py
@ -18,7 +18,7 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
-        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if not long_factors:
            return

        scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
+        rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):

        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

-        long_factors = rope_scaling.get('long_factor', None)
-        short_factors = rope_scaling.get('short_factor', None)
-
        if long_factors is None or short_factors is None:
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

--- a/conversion/qwen.py
+++ b/conversion/qwen.py
@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
--- a/conversion/stablelm.py
+++ b/conversion/stablelm.py
@ -28,7 +28,7 @@ class StableLMModel(TextModel):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        rotary_factor = self.rope_parameters["partial_rotary_factor"]
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
--- a/conversion/step3.py
+++ b/conversion/step3.py
@ -314,7 +314,7 @@ class Step35Model(TextModel):
        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
-        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
+        old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
--- a/docs/android.md
+++ b/docs/android.md
@ -29,7 +29,7 @@ With Termux, you can install and run `llama.cpp` as if the environment were Linu

 ```
 $ apt update && apt upgrade -y
-$ apt install git cmake
+$ apt install git cmake libandroid-spawn
 ```

 Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake.
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -413,6 +413,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
 |------------------|----------------------------------------|
 | Single device    | --split-mode none --main-gpu DEVICE_ID |
 | Multiple devices | --split-mode layer (default)           |
+| Multiple devices | --split-mode tensor (tensor parallelism) |
+
+`--split-mode tensor` (tensor parallelism) shards each layer across the selected
+GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
+left at its default `auto`, so `--split-mode tensor` works out of the box.
+Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
+context creation. The default `f16` KV cache is recommended. Tensor parallelism
+is currently optimized for 2 GPUs; other device counts fall back to a generic
+all-reduce.

 Examples:

@ -715,6 +724,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
 |------------------|----------------------------------------|
 | Single device    | --split-mode none --main-gpu DEVICE_ID |
 | Multiple devices | --split-mode layer (default)           |
+| Multiple devices | --split-mode tensor (tensor parallelism) |
+
+`--split-mode tensor` (tensor parallelism) shards each layer across the selected
+GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
+left at its default `auto`, so `--split-mode tensor` works out of the box.
+Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
+context creation. The default `f16` KV cache is recommended. Tensor parallelism
+is currently optimized for 2 GPUs; other device counts fall back to a generic
+all-reduce.

 Examples:

--- a/docs/backend/snapdragon/CMakeUserPresets.json
+++ b/docs/backend/snapdragon/CMakeUserPresets.json
@ -24,7 +24,6 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
-            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@ -47,7 +46,6 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
-            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@ -73,7 +71,6 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "OFF",
            "GGML_HEXAGON":     "ON",
-            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
--- a/docs/speculative.md
+++ b/docs/speculative.md
@ -13,6 +13,45 @@ The `llama-server` application supports several implementations of speculative d
 A much smaller model (called the _draft model_) generates drafts.
 A draft model is the most used approach in speculative decoding.

+### EAGLE-3 (`draft-eagle3`)
+
+EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
+reaches a higher acceptance rate than a standalone draft model of the same size. The draft is a one-layer transformer
+trained for a specific target model; it shares the target model's tokenizer and can optionally use a reduced draft
+vocabulary with its own `lm_head`, whose token ids are mapped back to the target vocabulary using a `d2t` table.
+
+Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
+indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
+checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
+for `Qwen/Qwen3-4B`):
+
+```bash
+python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
+    --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf
+
+llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
+```
+
+Supported EAGLE-3 draft models include:
+
+- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B)
+- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B)
+- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3)
+- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
+- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
+- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
+- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3)
+- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
+- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
+- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
+- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
+- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
+- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3)
+- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16)
+- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context)
+
+For the full and up-to-date list of supported models, see [#18039](https://github.com/ggml-org/llama.cpp/issues/18039).
+
 ### n-gram Cache (`ngram-cache`)

 An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
@ -108,7 +147,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
 ### General Speculative Parameters

 ```
--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
+--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                        comma-separated list of types of speculative decoding to use
                                        (default: none)
                                        (env: LLAMA_ARG_SPEC_TYPE)
@ -247,6 +286,7 @@ Specifies a comma-separated list of speculative decoding types to use.
 |------|-------------|
 | `none` | No speculative decoding (default) |
 | `draft-simple` | Use a simple draft model for speculation |
+| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
 | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@ -198,18 +198,18 @@ class BuiltinRule:
 SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'

 PRIMITIVE_RULES = {
-    'boolean'      : BuiltinRule('("true" | "false") space', []),
+    'boolean'      : BuiltinRule('("true" | "false")', []),
    'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
    'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
-    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
-    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part)', ['integral-part']),
    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
-    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []),
    'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
-    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
-    'null'         : BuiltinRule('"null" space', []),
+    'string'       : BuiltinRule(r'"\"" char* "\""', ['char']),
+    'null'         : BuiltinRule('"null"', []),
 }

 # TODO: support "uri", "email" string formats
@ -217,9 +217,9 @@ STRING_FORMAT_RULES = {
    'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
-    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
-    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
-    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\""', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\""', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']),
 }

 DOTALL = '[\\U00000000-\\U0010FFFF]'
@ -319,7 +319,7 @@ class SchemaConverter:
                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
        visit(trie)

-        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
+        out.append(f' ){"" if trie.is_end_of_string else "?"} ["]')
        return ''.join(out)

    def _add_rule(self, name, rule):
@ -549,7 +549,7 @@ class SchemaConverter:
        return self._add_rule(
            name,
            to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"")


    def _resolve_ref(self, ref):
@ -580,10 +580,10 @@ class SchemaConverter:
            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))

        elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))

        elif 'enum' in schema:
-            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')'
            return self._add_rule(rule_name, rule)

        elif schema_type in (None, 'object') and \
@ -624,7 +624,7 @@ class SchemaConverter:
                    enum_intersection &= s

                if enum_intersection:
-                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
+                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')'
                    return self._add_rule(rule_name, rule)

            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
@ -638,12 +638,12 @@ class SchemaConverter:
                    ' "," space '.join(
                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
                        for i, item in enumerate(items)) +
-                    ' "]" space')
+                    ' space "]"')
            else:
                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
                min_items = schema.get("minItems", 0)
                max_items = schema.get("maxItems")
-                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"')

        elif schema_type in (None, 'string') and 'pattern' in schema:
            return self._visit_pattern(schema['pattern'], rule_name)
@ -663,7 +663,7 @@ class SchemaConverter:
            min_len = schema.get('minLength', 0)
            max_len = schema.get('maxLength')

-            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""')

        elif schema_type in (None, 'integer') and \
                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
@ -680,7 +680,7 @@ class SchemaConverter:

            out = ["("]
            _generate_min_max_int(min_value, max_value, out)
-            out.append(") space")
+            out.append(")")
            return self._add_rule(rule_name, ''.join(out))

        elif (schema_type == 'object') or (len(schema) == 0):
@ -765,7 +765,7 @@ class SchemaConverter:
                rule += ' )'
            rule += ' )?'

-        rule += ' "}" space'
+        rule += ' space "}"'

        return rule

--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 15)
-set(GGML_VERSION_PATCH 1)
+set(GGML_VERSION_PATCH 2)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -266,7 +266,6 @@ set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                            "ggml: OpenCL API version to target")

 option(GGML_HEXAGON                         "ggml: enable Hexagon backend"                    OFF)
-set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")

 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@ -27,6 +27,14 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de
 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

+// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor
+// trio queried by the meta-backend via ggml_backend_reg_get_proc_address.
+// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's
+// pattern (ggml_backend_cuda_comm_*).
+GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends);
+GGML_BACKEND_API void   ggml_backend_sycl_comm_free(void * comm_ctx);
+GGML_BACKEND_API bool   ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors);
+
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);

-            parallel_for_ggml(params, n_batch, [&](int begin, int end) {
-                for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
+            parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
+                for (int idx = begin; idx < end; ++idx) {
+                    int batch_idx = idx / M;
+                    int m         = idx % M;
                    int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
                    const float * A_data = (const float *)((const char *)src1->data + src1_offset);
                    char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
-
-                    for (int m = 0; m < M; ++m) {
-                        from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
-                    }
+                    from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
                }
            });
        });
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC {
            else if (n_aligned % 16 == 0) nc = 16;
            else                          nc = 8;
        }
-        bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0);
+        bool can_use_tiled = n_aligned > 0 && (m % mc == 0);
        if (can_use_tiled) {
            matmul_tiled(m, n_aligned, mc, nc, kc);
            if (n > n_aligned) {
@ -3063,13 +3063,14 @@ class tinyBLAS_Q0_PPC {
            int64_t ii = (job / xtiles) * mc;
            int64_t jj = (job % xtiles) * nc;
            for (int64_t kk = 0; kk < k; kk += kc) {
+                int64_t k_cur = MIN(kc, k - kk);
                if constexpr(is_Ablock_q4) {
-                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
+                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
                } else {
-                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
+                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
                }
-                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack);
-                KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack);
+                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, k_cur, (uint8_t *)B_pack);
+                KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack);
            }
        }
    }
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@ -3688,8 +3688,6 @@ static void ggml_compute_forward_norm_f32(

    GGML_ASSERT(ggml_are_same_shape(src0, dst));

-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
    const int ith = params->ith;
    const int nth = params->nth;

@ -3703,25 +3701,49 @@ static void ggml_compute_forward_norm_f32(
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;

-                float sum = 0.0;
-                ggml_vec_sum_f32(ne00, &sum, x);
-                float mean = sum/ne00;
+                if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
+                    const float * xf = (const float *) x;

-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-                float variance = 0;
+                    float sum = 0.0;
+                    ggml_vec_sum_f32(ne00, &sum, xf);
+                    float mean = sum/ne00;
+
+                    float * yf = (float *) y;
+                    float variance = 0;

 #ifdef GGML_USE_ACCELERATE
-                mean = -mean;
-                vDSP_vsadd(x, 1, &mean, y, 1, ne00);
-                vDSP_measqv(y, 1, &variance, ne00);
+                    mean = -mean;
+                    vDSP_vsadd(xf, 1, &mean, yf, 1, ne00);
+                    vDSP_measqv(yf, 1, &variance, ne00);
 #else
-                variance = ggml_vec_cvar_f32(ne00, y, x, mean);
+                    variance = ggml_vec_cvar_f32(ne00, yf, xf, mean);
 #endif //GGML_USE_ACCELERATE

-                const float scale = 1.0f/sqrtf(variance + eps);
-                ggml_vec_scale_f32(ne00, y, scale);
+                    const float scale = 1.0f/sqrtf(variance + eps);
+                    ggml_vec_scale_f32(ne00, yf, scale);
+                } else {
+                    float sum = 0.0;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sum += *(const float *) (x + i00*nb00);
+                    }
+                    const float mean = sum/ne00;
+
+                    float variance = 0.0f;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const float v = *(const float *) (x + i00*nb00) - mean;
+                        *(float *) (y + i00*nb0) = v;
+                        variance += v * v;
+                    }
+                    variance /= ne00;
+
+                    const float scale = 1.0f/sqrtf(variance + eps);
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        *(float *) (y + i00*nb0) *= scale;
+                    }
+                }
            }
        }
    }
@ -4142,8 +4164,6 @@ static void ggml_compute_forward_l2_norm_f32(

    GGML_ASSERT(ggml_are_same_shape(src0, dst));

-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
    const int ith = params->ith;
    const int nth = params->nth;

@ -4158,20 +4178,27 @@ static void ggml_compute_forward_l2_norm_f32(
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;

                ggml_float sum = 0.0;
                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)(x[i00] * x[i00]);
+                    const float xi = *(const float *) (x + i00*nb00);
+                    sum += (ggml_float)(xi * xi);
                }

-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-
                const float scale = 1.0f/fmaxf(sqrtf(sum), eps);

-                ggml_vec_scale_f32(ne00, y, scale);
+                char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
+
+                if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
+                    memcpy(y, x, ne00 * sizeof(float));
+                    ggml_vec_scale_f32(ne00, (float *) y, scale);
+                } else {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const float xi = *(const float *) (x + i00*nb00);
+                        *(float *) (y + i00*nb0) = xi * scale;
+                    }
+                }
            }
        }
    }
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
 static __global__ void k_bin_bcast(const src0_t *         src0,
                                   const src1_t *         src1,
                                   dst_t *                dst,
-                                   const int              ne0,
-                                   const int              ne1,
-                                   const int              ne2,
+                                   const uint32_t         ne0,
+                                   const uint32_t         ne1,
+                                   const uint32_t         ne2,
                                   const uint3            ne3,
                                   const uint3            ne10,
                                   const uint3            ne11,
                                   const uint3            ne12,
                                   const uint3            ne13,
-                                 /*const int              s0,*/
-                                   const int              s1,
-                                   const int              s2,
-                                   const int              s3,
-                                   const int              s00,
-                                   const int              s01,
-                                   const int              s02,
-                                   const int              s03,
-                                   const int              s10,
-                                   const int              s11,
-                                   const int              s12,
-                                   const int              s13,
+                                 /*const uint32_t         s0,*/
+                                   const uint32_t         s1,
+                                   const uint32_t         s2,
+                                   const uint32_t         s3,
+                                   const uint32_t         s00,
+                                   const uint32_t         s01,
+                                   const uint32_t         s02,
+                                   const uint32_t         s03,
+                                   const uint32_t         s10,
+                                   const uint32_t         s11,
+                                   const uint32_t         s12,
+                                   const uint32_t         s13,
                                   src1_ptrs... src1s) {
    ggml_cuda_pdl_lc();
    const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
    const uint32_t i3  = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);

-    if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
        return;
    }

@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i12 = fastmodulo(i2, ne12);
    const uint32_t i13 = fastmodulo(i3, ne13);

-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+    const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
+    const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
+    const size_t i_dst  = size_t( i3)*s3  + size_t( i2)*s2  + size_t( i1)*s1;

    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
    dst_t * dst_row = dst + i_dst;

+    const uint32_t s0 = blockDim.x * gridDim.x;
+
    ggml_cuda_pdl_sync();
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
+    for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
        const uint32_t i10 = fastmodulo(i0, ne10);

-        float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
+        float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
        if constexpr (sizeof...(src1_ptrs) > 0) {
-            result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
+            result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
        } else {
-            result = bin_op(result, (float)src1[i_src1 + i10*s10]);
+            result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
        }

        dst_row[i0] = (dst_t) result;
+
+        // protect i0 from overflow
+        if (ne0 - i0 <= s0) {
+           break;
+        }
    }
 }

@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
                                           const uint3            ne12,
                                           const uint3            ne13,
                                         /*const int              s0,*/
-                                           const int              s1,
-                                           const int              s2,
-                                           const int              s3,
-                                           const int              s00,
-                                           const int              s01,
-                                           const int              s02,
-                                           const int              s03,
-                                           const int              s10,
-                                           const int              s11,
-                                           const int              s12,
-                                           const int              s13,
+                                           const uint32_t         s1,
+                                           const uint32_t         s2,
+                                           const uint32_t         s3,
+                                           const uint32_t         s00,
+                                           const uint32_t         s01,
+                                           const uint32_t         s02,
+                                           const uint32_t         s03,
+                                           const uint32_t         s10,
+                                           const uint32_t         s11,
+                                           const uint32_t         s12,
+                                           const uint32_t         s13,
                                           src1_ptrs... src1s) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    const uint32_t i  = blockDim.x*blockIdx.x + threadIdx.x;

    const uint32_t i3 = fastdiv(i, prod_012);
    const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
        return;
    }

-    const int i11 = fastmodulo(i1, ne11);
-    const int i12 = fastmodulo(i2, ne12);
-    const int i13 = fastmodulo(i3, ne13);
+    const uint32_t i11 = fastmodulo(i1, ne11);
+    const uint32_t i12 = fastmodulo(i2, ne12);
+    const uint32_t i13 = fastmodulo(i3, ne13);

-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+    const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
+    const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
+    const size_t i_dst  = size_t( i3)*s3  + size_t( i2)*s2  + size_t( i1)*s1;

    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
    dst_t * dst_row = dst + i_dst;

-    const int i10 = fastmodulo(i0, ne10);
+    const uint32_t i10 = fastmodulo(i0, ne10);

    ggml_cuda_pdl_sync();
-    float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
+    float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
    if constexpr (sizeof...(src1_ptrs) > 0) {
-        result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
+        result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
    } else {
-        result = bin_op(result, (float)src1[i_src1 + i10*s10]);
+        result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
    }

    dst_row[i0] = (dst_t) result;
@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        size_t s02 = nb02 / sizeof(src0_t);
        size_t s03 = nb03 / sizeof(src0_t);

+        GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
+
+      //GGML_ASSERT(s0  <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s1  <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s2  <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s3  <= std::numeric_limits<uint32_t>::max());
+
+        GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
+
+        GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
+
+        GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
+
        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

+        GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
+
        const int block_size = 128;

        int64_t hne0 = std::max(ne0 / 2LL, 1LL);
@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);

        if (block_nums.z > 65535 || block_nums.y > 65535) {
-            int         block_num  = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+            int64_t     block_num   = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+
+            GGML_ASSERT(block_num              <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(ne0 * ne1              <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(ne0 * ne1 * ne2        <= std::numeric_limits<uint32_t>::max());
+
            const uint3 prod_012    = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
            const uint3 prod_01     = init_fastdiv_values((uint32_t) (ne0 * ne1));
            const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
                    s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
            }
        } else {
+            GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
+
            const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
            {
                const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
--- a/ggml/src/ggml-cuda/col2im-1d.cu
+++ b/ggml/src/ggml-cuda/col2im-1d.cu
@ -0,0 +1,81 @@
+#include "col2im-1d.cuh"
+#include "convert.cuh"
+
+// col2im_1d: scatter-add GEMM columns to 1D signal (gather approach)
+// columns: [K*OC, T_in]  ->  output: [T_out, OC]
+// Supports F32, F16, BF16 data with F32 accumulator.
+//
+// One thread produces one element of dst. Instead of scattering every column
+// entry into the output, each thread gathers the column entries that land on
+// its own output position and sums them in float before casting back to T.
+//
+//   col       input columns, read at (oc*K + k) + t_in*K_OC
+//   dst       output signal, written at idx = oc*T_out + t_out
+//   T_in      number of input time steps (upper bound for t_in)
+//   T_out_fd  fastdiv constants for T_out, used to split idx into (oc, t_out)
+//   OC        number of output channels (not read by the kernel body)
+//   K         kernel width
+//   K_OC      K*OC, the stride between consecutive t_in columns
+//   s0        stride, must be > 0 (used as a divisor)
+//   p0        padding/crop offset added to t_out
+//   total     number of dst elements to produce
+
+template <typename T>
+static __global__ void col2im_1d_kernel(
+        const T * __restrict__ col,
+        T       * __restrict__ dst,
+        const int T_in, const uint3 T_out_fd,
+        const int OC, const int K, const int K_OC,
+        const int s0, const int p0, const int total) {
+
+    // Compute the global thread index in 64 bits: the launch is rounded up to a
+    // multiple of blockDim.x, so a 32-bit index could wrap to a negative value
+    // and slip past the bounds check below when total is close to INT_MAX.
+    const int64_t idx = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total) return;
+
+    // dst layout: [T_out, OC], ne[0]=T_out fastest
+    const uint2 qr  = fast_div_modulo((uint32_t)idx, T_out_fd);  // qr.x = idx / T_out, qr.y = idx % T_out
+    const int oc    = (int)qr.x;
+    const int t_out = (int)qr.y;
+    const int t_abs = t_out + p0;  // absolute position in uncropped signal
+
+    // Gather: find all (t_in, k) where t_in*s + k == t_abs, 0 <= k < K
+    // Integer division truncates toward zero, so a negative numerator yields a
+    // value <= 0 which the clamp below maps to 0 anyway.
+    int t_in_min = (t_abs - K + s0) / s0;  // ceil((t_abs - K + 1) / s)
+    if (t_in_min < 0) t_in_min = 0;
+    int t_in_max = t_abs / s0;
+    if (t_in_max >= T_in) t_in_max = T_in - 1;
+
+    // 64-bit offsets: oc*K and t_in*K_OC can exceed the int range for large
+    // tensors, and signed 32-bit overflow is undefined behavior.
+    const int64_t col_oc = (int64_t) oc * K;
+
+    float sum = 0.0f;
+    for (int t_in = t_in_min; t_in <= t_in_max; t_in++) {
+        const int k = t_abs - t_in * s0;  // 0 <= k < K by construction of the t_in range
+        // col layout: [K*OC, T_in], column index = oc * K + k
+        sum += ggml_cuda_cast<float>(col[col_oc + k + (int64_t) t_in * K_OC]);
+    }
+
+    dst[idx] = ggml_cuda_cast<T>(sum);
+}
+
+// Host-side launcher for GGML_OP_COL2IM_1D.
+//
+// Reads the stride (s0), output channel count (OC) and padding (p0) from
+// dst->op_params[0..2], derives the kernel width K from src0->ne[0] / OC and
+// launches one thread per dst element on the context's stream.
+//
+// Requirements (checked with GGML_ASSERT):
+//   - src0 and dst are contiguous and share the same type (F32, F16 or BF16);
+//     the kernel indexes both buffers linearly with a single element type
+//   - s0 > 0 and OC > 0, since both are used as divisors
+//   - all dimensions and the dst element count fit in a 32-bit int, because
+//     the kernel takes them as int parameters
+void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(dst->type == src0->type);
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t OC = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+
+    // s0 is a divisor inside the kernel, OC is a divisor right below
+    GGML_ASSERT(s0 > 0);
+    GGML_ASSERT(OC > 0);
+
+    // the kernel takes int-sized dimensions; refuse silent truncation
+    GGML_ASSERT(src0->ne[0] <= INT32_MAX);
+    GGML_ASSERT(src0->ne[1] <= INT32_MAX);
+    GGML_ASSERT(dst->ne[0]  <= INT32_MAX);
+
+    const int K_OC = (int) src0->ne[0];
+    const int T_in = (int) src0->ne[1];
+    const int K    = K_OC / OC;
+    const int T_out = (int) dst->ne[0];
+
+    const uint3 T_out_fd = init_fastdiv_values((uint32_t)T_out);
+
+    // compute the element count in 64 bits so that the overflow check itself
+    // cannot overflow
+    const int64_t total64 = (int64_t) T_out * OC;
+    GGML_ASSERT(total64 <= INT32_MAX);
+
+    const int total = (int) total64;
+    const int block_size = 256;
+    const int num_blocks = (int) ((total64 + block_size - 1) / block_size);
+
+    switch (src0->type) {
+        case GGML_TYPE_F32: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const float *)src0->data, (float *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        case GGML_TYPE_F16: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const half *)src0->data, (half *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        case GGML_TYPE_BF16: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        default:
+            GGML_ABORT("col2im_1d: unsupported type");
+    }
+}
--- a/ggml/src/ggml-cuda/col2im-1d.cuh
+++ b/ggml/src/ggml-cuda/col2im-1d.cuh
@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -11,6 +11,7 @@
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
+#include "ggml-cuda/col2im-1d.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/conv2d.cuh"
@ -3051,6 +3052,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cuda_op_conv_transpose_1d(ctx,dst);
            break;
+        case GGML_OP_COL2IM_1D:
+            ggml_cuda_op_col2im_1d(ctx, dst);
+            break;
        case GGML_OP_POOL_2D:
            ggml_cuda_op_pool2d(ctx, dst);
            break;
@ -5316,13 +5320,21 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                }
                return false;
            } break;
+        case GGML_OP_COL2IM_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16 || src0_type == GGML_TYPE_BF16) &&
+                    op->type == src0_type &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op);
+            } break;
        case GGML_OP_SILU_BACK:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
            break;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
        case GGML_OP_L2_NORM:
-            return true;
+            return ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_RMS_NORM_BACK:
            return ggml_is_contiguous(op->src[0]);
            break;
--- a/ggml/src/ggml-hexagon/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/CMakeLists.txt
@ -25,7 +25,6 @@ include(ExternalProject)
 option(GGML_HEXAGON_HTP_DEBUG  "ggml-hexagon: enable HTP debug output" OFF)
 option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
 set(GGML_HEXAGON_HTP_CERT  "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
-set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")

 add_library(htp_iface OBJECT
    ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
@ -72,15 +71,12 @@ function(build_htp_skel V)
            -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
            -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
            -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-            -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
            -DDSP_VERSION=${V}
            -DPREBUILT_LIB_DIR="toolv19_${V}")
    list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
    set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
 endfunction()

-build_htp_skel(v68)
-build_htp_skel(v69)
 build_htp_skel(v73)
 build_htp_skel(v75)
 build_htp_skel(v79)
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
--- a/ggml/src/ggml-hexagon/htp-opnode.h
+++ b/ggml/src/ggml-hexagon/htp-opnode.h
@ -5,10 +5,12 @@
 #include "ggml-backend-impl.h"
 #include "ggml-common.h"

+#include <algorithm>
 #include <string>
 #include <vector>
 #include <stdio.h>
 #include "htp-ops.h"
+#include "htp/matmul-ops.h"

 struct htp_opnode {
    ggml_tensor * node = nullptr;
@ -17,6 +19,13 @@ struct htp_opnode {

    htp_op_code opcode = HTP_OP_INVALID;

+    std::vector<ggml_tensor *> extra_dsts;
+
+    int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS] = {0};
+
+    htp_opnode(ggml_tensor * node = nullptr, std::vector<ggml_tensor *> fused = {}, htp_op_code opcode = HTP_OP_INVALID, std::vector<ggml_tensor *> extra_dsts = {})
+        : node(node), fused(std::move(fused)), opcode(opcode), extra_dsts(std::move(extra_dsts)) {}
+
    ggml_op op() const {
        return node->op;
    }
@ -25,6 +34,26 @@ struct htp_opnode {
        return fused.empty() ? node : fused.back();
    }

+    void add_fused(ggml_tensor * t, bool extra_dst = false) {
+        fused.push_back(t);
+        if (extra_dst) {
+            extra_dsts.push_back(t);
+        }
+    }
+
+    std::vector<const ggml_tensor *> get_outputs() const {
+        std::vector<const ggml_tensor *> res;
+        if (extra_dsts.empty()) {
+            res.push_back(dst());
+        } else {
+            res.push_back(node);
+            for (const auto * x : extra_dsts) {
+                res.push_back(x);
+            }
+        }
+        return res;
+    }
+
    const ggml_tensor * src0() const {
        return node->src[0];
    }
@ -37,10 +66,6 @@ struct htp_opnode {
        return ggml_op_is_empty(node->op);
    }

-    void add_fused(ggml_tensor * t) {
-        fused.push_back(t);
-    }
-
    bool stackable() const {
        switch (this->op()) {
            case GGML_OP_MUL_MAT:
@ -131,87 +156,117 @@ struct htp_opformat {
    char types[16 * GGML_MAX_SRC];
    char buffs[64 * GGML_MAX_SRC];
    char names[64 * GGML_MAX_SRC];
+    char kparams[128];

-    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
+    int format_tensor_dims(char * str, size_t max_size, const struct ggml_tensor * t) {
        if (!t) {
-            return sprintf(str, "NONE");
+            return snprintf(str, max_size, "NONE");
        }
        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
+            return snprintf(str, max_size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
        } else {
-            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+            return snprintf(str, max_size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
        }
    }

-    void format_op_dims(char * str, const htp_opnode & node) {
+    void format_op_dims(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += format_tensor_dims(p, inputs[0]);
+            p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[0]), (size_t)(p_end - p));

            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_dims(p, inputs[i]);
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[i]), (size_t)(p_end - p));
+                }
            }

-            p += sprintf(p, " -> ");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

        char self[64];
-        format_tensor_dims(self, node.dst());
-        p += sprintf(p, "%s", self);
+        format_tensor_dims(self, sizeof(self), node.dst());
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
+        }
    }

-    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
+    int format_tensor_strides(char * str, size_t max_size, const struct ggml_tensor * t) {
        if (!t) {
-            return sprintf(str, "NONE");
+            return snprintf(str, max_size, "NONE");
        }
        const char * c = ggml_is_contiguous(t) ? "" : "!";

        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
+            return snprintf(str, max_size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
        } else {
-            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
+            return snprintf(str, max_size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
        }
    }

-    void format_op_strides(char * str, const htp_opnode & node) {
+    void format_op_strides(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += format_tensor_strides(p, inputs[0]);
+            p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[0]), (size_t)(p_end - p));

            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_strides(p, inputs[i]);
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[i]), (size_t)(p_end - p));
+                }
            }

-            p += sprintf(p, " -> ");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

        char self[64];
-        format_tensor_strides(self, node.dst());
-        p += sprintf(p, "%s", self);
+        format_tensor_strides(self, sizeof(self), node.dst());
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
+        }
    }

-    void format_op_types(char * str, const htp_opnode & node) {
+    void format_op_types(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE");
-
-            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"), (size_t)(p_end - p));
            }

-            p += sprintf(p, " -> ");
+            for (size_t i = 1; i < inputs.size(); i++) {
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"), (size_t)(p_end - p));
+                }
+            }
+
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

-        p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", ggml_type_name(node.dst()->type)), (size_t)(p_end - p));
+        }
    }

    const char * tensor_buff_name(const struct ggml_tensor * t) {
@ -221,51 +276,102 @@ struct htp_opformat {
        return "NONE";
    }

-    void format_op_buffs(char * str, const htp_opnode & node) {
+    void format_op_buffs(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
-
-            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[0])), (size_t)(p_end - p));
            }

-            p += sprintf(p, " -> ");
+            for (size_t i = 1; i < inputs.size(); i++) {
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[i])), (size_t)(p_end - p));
+                }
+            }
+
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

-        p += sprintf(p, "%s", tensor_buff_name(node.dst()));
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(node.dst())), (size_t)(p_end - p));
+        }
    }

-    void format_op_names(char * str, const htp_opnode & node) {
+    void format_op_names(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE");
-
-            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? inputs[0]->name : "NONE"), (size_t)(p_end - p));
            }

-            p += sprintf(p, " -> ");
+            for (size_t i = 1; i < inputs.size(); i++) {
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? inputs[i]->name : "NONE"), (size_t)(p_end - p));
+                }
+            }
+
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

-        p += sprintf(p, "%s", node.dst()->name);
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", node.dst()->name), (size_t)(p_end - p));
+        }
+    }
+    void format_kernel_params(char * str, size_t max_size, const htp_opnode & node) {
+        if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID ||
+            node.opcode == HTP_OP_MUL_MAT_QKV || node.opcode == HTP_OP_MUL_MAT_FFN) {
+            const auto * kparams = (const struct htp_mm_kernel_params *) node.kernel_params;
+            const char * path = "unknown";
+            int32_t type = kparams->kernel_type;
+            if (type == HTP_MM_KERNEL_HMX_2D || type == HTP_MM_KERNEL_HMX_F16_BATCHED) {
+                path = "hmx-tiled";
+            } else if (type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || type == HTP_MM_KERNEL_HVX_F32_F32_VTCM ||
+                       type == HTP_MM_KERNEL_HVX_QUANT_ROW    || type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) {
+                path = "hvx-tiled";
+            } else if (type == HTP_MM_KERNEL_HVX_F16_F16_DDR  || type == HTP_MM_KERNEL_HVX_F16_F32_DDR ||
+                       type == HTP_MM_KERNEL_HVX_F32_F32_DDR  || type == HTP_MM_KERNEL_HVX_F32_F16_DDR ||
+                       type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) {
+                path = "hvx-flat";
+            }
+            snprintf(str, max_size, "%s vtcm %d", path, (int) kparams->vtcm_size);
+        } else {
+            snprintf(str, max_size, "----");
+        }
    }

    void format(const htp_opnode & node) {
-        format_op_dims(dims, node);
-        format_op_strides(strides, node);
-        format_op_types(types, node);
-        format_op_buffs(buffs, node);
-        format_op_names(names, node);
+        format_op_dims(dims, sizeof(dims), node);
+        format_op_strides(strides, sizeof(strides), node);
+        format_op_types(types, sizeof(types), node);
+        format_op_buffs(buffs, sizeof(buffs), node);
+        format_op_names(names, sizeof(names), node);
+        format_kernel_params(kparams, sizeof(kparams), node);
    }

-    htp_opformat() {}
+    htp_opformat() {
+        strides[0] = '\0';
+        dims[0]    = '\0';
+        types[0]   = '\0';
+        buffs[0]   = '\0';
+        names[0]   = '\0';
+        kparams[0] = '\0';
+    }
    htp_opformat(const htp_opnode & node) { format(node); }
 };

--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@ -19,43 +19,9 @@ add_library(${HTP_LIB} SHARED
    htp_iface_skel.c
    worker-pool.c
    hex-dma.c
-)
-
-target_compile_definitions(${HTP_LIB} PRIVATE
-    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
-    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
-    FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
-
-if (GGML_HEXAGON_FA_EXP2_HF)
-    message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
-    target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
-endif()
-
-# HMX acceleration: available on v73+ architectures
-set(HTP_HMX_VERSIONS v73 v75 v79 v81)
-list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
-
-if (_hmx_idx GREATER_EQUAL 0)
-    target_sources(${HTP_LIB} PRIVATE
-        hmx-flash-attn-ops.c
-        hmx-matmul-ops.c
-        hmx-queue.c
-    )
-
-    # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
-    set_source_files_properties(
-        hmx-flash-attn-ops.c
-        hmx-matmul-ops.c
-        hmx-queue.c
-        PROPERTIES COMPILE_OPTIONS "-mhmx"
-    )
-
-    target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
-endif()
-
-build_idl(htp_iface.idl ${HTP_LIB})
-
-target_sources(${HTP_LIB} PRIVATE
+    hmx-queue.c
+    flash-attn-ops.c
+    hmx-flash-attn-ops.c
    matmul-ops.c
    binary-ops.c
    unary-ops.c
@ -63,7 +29,6 @@ target_sources(${HTP_LIB} PRIVATE
    softmax-ops.c
    act-ops.c
    rope-ops.c
-    flash-attn-ops.c
    set-rows-ops.c
    get-rows-ops.c
    cpy-ops.c
@ -79,6 +44,17 @@ target_sources(${HTP_LIB} PRIVATE
    pad-ops.c
 )

+target_compile_definitions(${HTP_LIB} PRIVATE
+    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
+    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>)
+
+if (GGML_HEXAGON_FA_EXP2_HF)
+    message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
+    target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
+endif()
+
+build_idl(htp_iface.idl ${HTP_LIB})
+
 set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)

 install(TARGETS ${HTP_LIB})
--- a/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake
+++ b/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake
@ -3,7 +3,7 @@ if (HEXAGON_TOOLCHAIN_INCLUDED)
 endif()
 set(HEXAGON_TOOLCHAIN_INCLUDED true)

-#Cross Compiling for Hexagon
+# Cross Compiling for Hexagon
 set(HEXAGON TRUE)
 set(CMAKE_SYSTEM_NAME QURT)
 set(CMAKE_SYSTEM_PROCESSOR Hexagon)
@ -14,7 +14,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
 set(CUSTOM_RUNELF_PATH "")

-#To fix backward compatibility with EAI addon.
 if (NOT HEXAGON_SDK_ROOT)
    set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
 endif()
@ -31,7 +30,6 @@ endif()
 file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
 file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}"   HEXAGON_SDK_ROOT)

-#Get the Binary extension of the Hexagon Toolchain
 if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
    set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
 endif()
@ -48,12 +46,12 @@ set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
    HEXAGON_TOOLS_ROOT
 )

-#QURT Related includes and linker flags
+# QURT Related includes and linker flags
 set(V_ARCH ${HEXAGON_ARCH})
 set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
 set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")

-if( ${TREE} MATCHES PAKMAN )
+if (${TREE} MATCHES PAKMAN)
    set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
 endif()
 message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
@ -83,11 +81,9 @@ set(QURT_START_LINK_LIBS
    )
 STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")

-set(QURT_END_LINK_LIBS
-    ${TARGET_DIR}/fini.o
-    )
+set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o)

-#Non QURT related includes and linker flags
+# Non QURT related includes and linker flags

 set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")

@ -99,8 +95,10 @@ if (NOT NO_WRAP_MEM_API)
    set(WRAP_MEMALIGN -Wl,--wrap=memalign)
 endif()

+set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx")
+
 set(PIC_SHARED_LD_FLAGS
-    -mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
+    ${ARCH_FLAGS}
    -G0
    -fpic
    -Wl,-Bsymbolic
@ -120,13 +118,13 @@ STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")

 set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")

-#System include paths
+# System include paths
 include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
 include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
 include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)

-#LLVM toolchain setup
-#Compiler paths, options and architecture
+# LLVM toolchain setup
+# Compiler paths, options and architecture
 set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
 set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
 set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
@ -137,8 +135,8 @@ set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)
 set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG   "-Wl,-soname,")
 set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")

-#Compiler Options
-set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
+# Compiler Options
+set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")

 set(CMAKE_CXX_FLAGS_DEBUG          "${COMMON_FLAGS} -O0 -D_DEBUG -g")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@ -18,7 +18,8 @@
 #include "htp-ctx.h"
 #include "htp-ops.h"
 #include "htp-ops.h"
-#include "hmx-ops.h"
+
+int hmx_flash_attn_ext(struct htp_ops_context * octx);

 // Must be multiple of 32
 #define FLASH_ATTN_BLOCK_SIZE (32 * 2)
@ -633,7 +634,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
        return HTP_STATUS_NO_SUPPORT;
    }

-#ifdef HTP_HAS_HMX
    // HMX path: head_dim multiple of 64, F16 KV, and no sinks
    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) {
        int ret = hmx_flash_attn_ext(octx);
@ -642,7 +642,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
        }
        // VTCM too small or other failure -> fall through to HVX path
    }
-#endif

    struct htp_fa_context factx;
    factx.octx = octx;
--- a/ggml/src/ggml-hexagon/htp/hex-common.h
+++ b/ggml/src/ggml-hexagon/htp/hex-common.h
@ -0,0 +1,80 @@
+#ifndef HEX_COMMON_H
+#define HEX_COMMON_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#ifndef SIZE_MAX
+#define SIZE_MAX ((size_t)-1)
+#endif
+
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+static inline uint32_t hex_ceil_pow2(uint32_t x) {
+    if (x <= 1) { return 1; }
+    int p = 2;
+    x--;
+    while (x >>= 1) { p <<= 1; }
+    return p;
+}
+
+static inline size_t hmx_ceil_div(size_t num, size_t den) {
+    return (num + den - 1) / den;
+}
+
+static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
+    return ((size_t) addr & (align - 1)) == 0;
+}
+
+static inline size_t hex_align_up(size_t v, size_t align) {
+    return hmx_ceil_div(v, align) * align;
+}
+
+static inline size_t hex_align_down(size_t v, size_t align) {
+    return (v / align) * align;
+}
+
+static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
+    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
+    uint32_t right_off = left_off + n;
+    return right_off <= chunk_size;
+}
+
+static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
+    return m * ((n + m - 1) / m);
+}
+
+static inline size_t hex_smin(size_t a, size_t b) {
+    return a < b ? a : b;
+}
+
+static inline size_t hex_smax(size_t a, size_t b) {
+    return a > b ? a : b;
+}
+
+static inline void hex_swap_ptr(void ** p1, void ** p2) {
+    void * t = *p1;
+    *p1      = *p2;
+    *p2      = t;
+}
+
+static inline bool hex_mul_overflow(size_t a, size_t b, size_t *out) {
+    if (a != 0 && b > SIZE_MAX / a) return true;
+    *out = a * b;
+    return false;
+}
+
+static inline bool hex_add_overflow(size_t a, size_t b, size_t *out) {
+    if (a > SIZE_MAX - b) return true;
+    *out = a + b;
+    return false;
+}
+
+#endif // HEX_COMMON_H
--- a/ggml/src/ggml-hexagon/htp/hex-dma.h
+++ b/ggml/src/ggml-hexagon/htp/hex-dma.h
@ -5,6 +5,7 @@
 #include <hexagon_types.h>
 #include <stdbool.h>
 #include <stdint.h>
+#include "hex-utils.h"

 #include "hex-profile.h"

@ -127,13 +128,8 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
    return p;
 }

-#if __HVX_ARCH__ < 73
-static const uint32_t dma_src_l2_bypass_on = 1;
-static const uint32_t dma_dst_l2_bypass_on = 0;
-#else
 static const uint32_t dma_src_l2_bypass_on = 1;
 static const uint32_t dma_dst_l2_bypass_on = 1;
-#endif

 static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
    if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
--- a/ggml/src/ggml-hexagon/htp/hex-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hex-utils.h
@ -11,14 +11,7 @@

 #include "hex-fastdiv.h"
 #include "hex-dump.h"
-
-#ifndef MAX
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
-#ifndef MIN
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#endif
+#include "hex-common.h"

 static inline uint64_t hex_get_cycles() {
    uint64_t cycles = 0;
@ -32,54 +25,6 @@ static inline uint64_t hex_get_pktcnt() {
    return pktcnt;
 }

-static inline uint32_t hex_ceil_pow2(uint32_t x) {
-    if (x <= 1) { return 1; }
-    int p = 2;
-    x--;
-    while (x >>= 1) { p <<= 1; }
-    return p;
-}
-
-static inline size_t hmx_ceil_div(size_t num, size_t den) {
-    return (num + den - 1) / den;
-}
-
-static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
-    return ((size_t) addr & (align - 1)) == 0;
-}
-
-static inline size_t hex_align_up(size_t v, size_t align) {
-    return hmx_ceil_div(v, align) * align;
-}
-
-static inline size_t hex_align_down(size_t v, size_t align) {
-    return (v / align) * align;
-}
-
-static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
-    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
-    uint32_t right_off = left_off + n;
-    return right_off <= chunk_size;
-}
-
-static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
-    return m * ((n + m - 1) / m);
-}
-
-static inline size_t hex_smin(size_t a, size_t b) {
-    return a < b ? a : b;
-}
-
-static inline size_t hex_smax(size_t a, size_t b) {
-    return a > b ? a : b;
-}
-
-static inline void hex_swap_ptr(void ** p1, void ** p2) {
-    void * t = *p1;
-    *p1      = *p2;
-    *p2      = t;
-}
-
 static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
    const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
    Q6_l2fetch_AP((void *) p, control);
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@ -49,7 +49,7 @@
 // g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions.
 // Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales
 // Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax.
-static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) {
+static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool pipeline) {
    const size_t g_br         = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
    const size_t q_tile_size  = hex_align_up(g_br * DK * sizeof(__fp16), 4096);    // Q:  [g_br, DK]
    const size_t o_tile_size  = hex_align_up(g_br * DV * sizeof(__fp16), 4096);    // O:  [g_br, DV] x2 ping-pong
@ -70,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
           + k_dma_size  * 2               // K DMA x2
           + v_dma_size  * 2               // V DMA x2
           + k_tile_size * 1               // K tiles
-           + v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
+           + v_tile_size * (pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
           + s_tile_size * 2               // S + P
           + d_tile_size * 1               // D (diagonal matrix)
           + col_vec_size * 4              // m_vec, l_vec, s_rowmax, p_rowsum
@ -290,7 +290,7 @@ static const int16_t d_tile_scatter_offsets[64] __attribute__((aligned(128))) =

 struct hmx_fa_context {
    const struct htp_ops_context * octx;
-    bool         use_pipeline;  // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
+    bool         pipeline;  // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
    uint32_t     n_threads;

    // Op parameters
@ -409,7 +409,7 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
        return;
    }

-    __fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
+    __fp16 * v_tiles_dest = factx->pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];

    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
@ -1312,13 +1312,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS);

    const uint32_t n_kv_blocks  = (nek1 + Bc - 1) / Bc;
-    const bool     use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
+    const bool     pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);

    // Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1
-    const uint32_t n_threads = use_pipeline ? n_threads_init : 1;
+    const uint32_t n_threads = pipeline ? n_threads_init : 1;

    FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu",
-         neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget);
+         neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, pipeline, vtcm_budget);

    // ======== Build context ========
    struct hmx_fa_context factx;
@ -1339,7 +1339,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    factx.n_kv_blocks    = n_kv_blocks;
    factx.is_q_fp32      = (q->type == HTP_TYPE_F32);
    factx.is_dst_fp32    = (dst->type == HTP_TYPE_F32);
-    factx.use_pipeline   = use_pipeline;
+    factx.pipeline   = pipeline;
    factx.mask_broadcast = (mask != NULL && mask->ne[2] == 1);

    // Extract op parameters (mutable during softcap adjustment, then stored as const in factx)
@ -1405,7 +1405,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    factx.vtcm_v_fp16[1]      = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
    factx.vtcm_k_tiles        = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes);
    factx.vtcm_v_tiles[0]     = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
-    if (use_pipeline) {
+    if (pipeline) {
        factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
    } else {
        factx.vtcm_v_tiles[1] = NULL;
@ -1456,7 +1456,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    // ======== HMX lock strategy ========
    // Pipeline: queue thread auto-acquires HMX lock on first push; released by suspend.
    // Fallback: main thread holds the lock (original behavior).
-    if (!factx.use_pipeline) {
+    if (!factx.pipeline) {
        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
    }

@ -1550,7 +1550,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                const size_t k_src_stride = size_k_row_padded / sizeof(__fp16);
                const size_t v_src_stride = size_v_row_padded / sizeof(__fp16);

-                if (factx.use_pipeline) {
+                if (factx.pipeline) {
                    // ==================================================================
                    // Pipeline path: HVX phases ‖ HMX queue worker
                    // ==================================================================
@ -1780,7 +1780,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                    fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);

                    // HMX: O_final = diag(1/l) @ O_prev
-                    if (factx.use_pipeline) {
+                    if (factx.pipeline) {
                        on_job.o_curr           = o_tile_curr;
                        on_job.o_prev           = o_tile_prev;
                        on_job.d_tiles          = factx.vtcm_d_tiles;
@ -1826,7 +1826,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
        }  // end KV head loop
    }  // end batch loop

-    if (factx.use_pipeline) {
+    if (factx.pipeline) {
        hmx_queue_suspend(ctx->hmx_queue);
    } else {
        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
--- a/ggml/src/ggml-hexagon/htp/hmx-mm-kernels-tiled.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-mm-kernels-tiled.h
--- a/ggml/src/ggml-hexagon/htp/hmx-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-ops.c
@ -1,6 +0,0 @@
-// HMX operations compiled as a single translation unit.
-// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO.
-
-#include "hmx-queue.c"
-#include "hmx-matmul-ops.c"
-#include "hmx-flash-attn-ops.c"
--- a/ggml/src/ggml-hexagon/htp/hmx-ops.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-ops.h
@ -1,88 +0,0 @@
-// HMX operation entry-point declarations.
-// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib)
-
-#ifndef HMX_OPS_H
-#define HMX_OPS_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "htp-ops.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-    float        *dst;
-    const float  *activation;
-    const __fp16 *permuted_weight;
-    int           m;
-    int           k;
-    int           n;
-    int           act_stride;
-    int           weight_stride;
-    int           dst_stride;
-    int           ne02;
-    int           ne03;
-    int           ne12;
-    int           ne13;
-    size_t        src0_nb2;
-    size_t        src0_nb3;
-    size_t        src1_nb2;
-    size_t        src1_nb3;
-    size_t        dst_nb2;
-    size_t        dst_nb3;
-} hmx_matmul_f16_f32_batched_params_t;
-
-// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
-// act_stride: activation row stride in elements (= k for contiguous, or
-//             nb[1]/sizeof(float) for permuted tensors like attention Q).
-// weight_stride: weight row stride in elements (= k for compact weights, or
-//                nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
-int hmx_matmul_f16_f32(struct htp_context *ctx,
-                                float *restrict dst,
-                                const float *activation,
-                                const __fp16 *permuted_weight,
-                                int m, int k, int n,
-                                int act_stride,
-                                int weight_stride);
-
-// Batched F16 wrapper over hmx_mat_mul_f16_f32.
-// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
-int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
-
-// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4)
-int hmx_matmul_2d_f32(struct htp_context *ctx,
-                                      float *restrict dst,
-                                      const float *activation,
-                                      const uint8_t *permuted_weight,
-                                      int m, int k, int n,
-                                      int act_stride,
-                                      int weight_stride,
-                                      int weight_type);
-
-struct mmid_row_mapping;
-
-int hmx_matmul_id_2d_f32(struct htp_context *ctx,
-                                         float *restrict dst,
-                                         const float *activation,
-                                         const uint8_t *permuted_weight,
-                                         int m, int k, int n,
-                                         int ne11,
-                                         size_t act_nb1, size_t act_nb2,
-                                         size_t dst_nb1, size_t dst_nb2,
-                                         int weight_stride,
-                                         int weight_type,
-                                         const struct mmid_row_mapping *matrix_rows,
-                                         int cur_a,
-                                         int mapping_stride);
-
-// HMX flash attention
-int hmx_flash_attn_ext(struct htp_ops_context * octx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // HMX_OPS_H
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@ -13,7 +13,9 @@
 #include <stdint.h>
 #include <stdbool.h>

+#ifndef HTP_MAX_NTHREADS
 #define HTP_MAX_NTHREADS 10
+#endif
 #define HTP_MAX_MMAPS    16

 // Memory mapping
@ -42,9 +44,13 @@ struct htp_ops_context {

    enum htp_op_code    op; // FIXME: rename to opcode
    int32_t             op_params[HTP_OP_MAX_PARAMS];
+    int32_t             kernel_params[HTP_OP_MAX_KERN_PARAMS];

    const struct htp_tensor * src[HTP_OP_MAX_INPUTS];
-    const struct htp_tensor * dst;
+    union {
+        const struct htp_tensor * dst;
+        const struct htp_tensor * dsts[HTP_OP_MAX_OUTPUTS];
+    };

    // TODO convert these to an array
    struct htp_spad src0_spad;
@ -87,13 +93,13 @@ struct htp_context {

    struct htp_ops_context octx;

-#ifdef HTP_HAS_HMX
    struct hmx_queue *     hmx_queue; // Async HMX queue for pipeline overlap
-#endif
 };

 int op_matmul(struct htp_ops_context * octx);
 int op_matmul_id(struct htp_ops_context * octx);
+int op_matmul_qkv(struct htp_ops_context * octx);
+int op_matmul_ffn(struct htp_ops_context * octx);
 int op_binary(struct htp_ops_context * octx);
 int op_unary(struct htp_ops_context * octx);
 int op_sum_rows(struct htp_ops_context * octx);
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@ -28,18 +28,19 @@ enum htp_data_type {
    HTP_TYPE_MXFP4  = 39,

    // types used internally for repack, dyn.quant, etc
-    HTP_TYPE_Q4_0x4x2 = 200,
-    HTP_TYPE_Q4_1x4x2,
-    HTP_TYPE_Q8_0x4x2,
-    HTP_TYPE_MXFP4x4x2,
+    HTP_TYPE_Q4_0_TILED = 200,
+    HTP_TYPE_Q4_1_TILED,
+    HTP_TYPE_Q8_0_TILED,
+    HTP_TYPE_MXFP4_TILED,

    HTP_TYPE_INVALID
 };

 // Constants for internal types
-#define QK_Q4_0x4x2  256  // 4x Q4_0  blocks packed with next 4x Q4_0 blocks (size in bytes 128)
-#define QK_Q8_0x4x2  256  // 4x Q8_0  blocks concat with next 4x Q8_0 blocks
-#define QK_MXFP4x4x2 256  // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
+#define QK_Q4_0_TILED  256  // 32x32 Q4_0 tiled layout
+#define QK_Q8_0_TILED  128  // 32x32 Q8_0 tiled layout
+#define QK_MXFP4_TILED 256  // 32x32 MXFP4 tiled layout
+


 // Mask to enable various stages of the Ops.
@ -57,6 +58,8 @@ enum htp_op_code {
    HTP_OP_DIV = 3,
    HTP_OP_MUL_MAT,
    HTP_OP_MUL_MAT_ID,
+    HTP_OP_MUL_MAT_QKV,
+    HTP_OP_MUL_MAT_FFN,
    HTP_OP_RMS_NORM,
    HTP_OP_RMS_NORM_MUL,
    HTP_OP_UNARY_SILU,
@ -99,7 +102,9 @@ enum htp_op_code {

 #define HTP_OP_MAX_DIMS    4    // aka GGML_MAX_DIMS
 #define HTP_OP_MAX_INPUTS  6    // aka GGML_MAX_SRCS
+#define HTP_OP_MAX_OUTPUTS 4
 #define HTP_OP_MAX_PARAMS  16   // aka GGML_MAX_OP_PARAMS
+#define HTP_OP_MAX_KERN_PARAMS 32

 #define HTP_OP_MAX_BUFS    16
 #define HTP_OP_MAX_REQS    256
@ -142,8 +147,10 @@ struct htp_op_desc {
    uint32_t opcode;                    // GGML/HTP Op
    uint32_t flags;                     // Op flags
    int32_t  params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
+    int32_t  kernel_params[HTP_OP_MAX_KERN_PARAMS]; // generic blob for host-precomputed parameters
    uint16_t src[HTP_OP_MAX_INPUTS];    // Input tensors indices
-    uint16_t dst;                       // Output tensor index
+    uint16_t dst[HTP_OP_MAX_OUTPUTS];   // Output tensor indices
+    uint16_t pad[2];                    // padding to align to 64 bits
 };

 #ifndef HTP_MAX_NTHREADS
--- a/ggml/src/ggml-hexagon/htp/htp_iface.idl
+++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl
@ -11,12 +11,13 @@ struct htp_iface_pmu_conf {
 };

 interface htp_iface : remote_handle64 {
-    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem);
+    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 n_hmx, in uint64 max_vmem);
    AEEResult stop();
    AEEResult mmap(in uint32 fd, in uint32 size);
    AEEResult munmap(in uint32 fd);
    AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
    AEEResult etm(in uint32 enable);
+    AEEResult hwinfo(rout uint32 n_threads, rout uint32 n_hvx, rout uint32 n_hmx, rout uint64 vtcm_size);
 };

 #endif /* HTP_IDL */
--- a/ggml/src/ggml-hexagon/htp/hvx-base.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-base.h
@ -170,25 +170,7 @@ static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
 }
 #endif

-/* Q6_Vsf_equals_Vw is only available on v73+.*/
-#if __HVX_ARCH__ < 73
-static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
-{
-    HVX_Vector const vzero = Q6_V_vzero();
-    HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
-    HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
-    HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
-    HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
-    HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
-    HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
-    return ret;
-}

-static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
-{
-    return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in));
-}
-#endif

 static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
    // This looks complicated.
@ -305,4 +287,17 @@ static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {

 #endif // __HVX_ARCH__ < 79

+// Fetch the activation vector for k-tile `kt`, reusing one load across four consecutive tiles.
+//
+// When kt % 4 == 0 a vector is loaded with hvx_vmem() from y_q + kt * 32 and cached in
+// *v_act_all. For kt % 4 == 1, 2, 3 no memory is read: the cached vector is returned after
+// Q6_V_vror_VR() with a rotate amount of 32, 64 or 96 respectively.
+//
+// The caller must invoke this with kt % 4 == 0 before the other residues of the same group;
+// otherwise *v_act_all still holds whatever an earlier call (or the caller) left there.
+// NOTE(review): presumably each tile occupies 32 bytes of one 128-byte HVX vector - confirm.
+static inline HVX_Vector hvx_vec_load_act_tile(const uint8_t * y_q, uint32_t kt, HVX_Vector * v_act_all) {
+    if (kt % 4 == 0) {
+        *v_act_all = hvx_vmem(y_q + kt * 32);
+        return *v_act_all;
+    } else if (kt % 4 == 1) {
+        return Q6_V_vror_VR(*v_act_all, 32);
+    } else if (kt % 4 == 2) {
+        return Q6_V_vror_VR(*v_act_all, 64);
+    } else {
+        return Q6_V_vror_VR(*v_act_all, 96);
+    }
+}
+
 #endif /* HVX_BASE_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-flat.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-flat.h
--- a/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-tiled.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-tiled.h
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@ -361,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
 static void htp_packet_callback(dspqueue_t queue, int error, void * context);
 static void htp_error_callback(dspqueue_t queue, int error, void * context);

-AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
+AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp_queue_id, uint32_t n_hvx, uint32_t n_hmx, uint64_t max_vmem) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
@ -395,10 +395,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
        return AEE_ENOMEMORY;
    }

-#ifdef HTP_HAS_HMX
-    ctx->hmx_enabled = use_hmx;
+    ctx->hmx_enabled = n_hmx;
    ctx->hmx_queue   = NULL;
-    if (use_hmx) {
+    if (n_hmx) {
        ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
        if (ctx->hmx_queue) {
            ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
@ -407,8 +406,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
            ctx->hmx_enabled = false;
        }
    }
-    FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
-#endif
+    FARF(HIGH, "HMX %s (n_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", n_hmx);

    qurt_sysenv_max_hthreads_t hw_threads;
    qurt_sysenv_get_max_hw_threads(&hw_threads);
@ -481,13 +479,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
        dma_queue_delete(ctx->dma[i]);
    }

-#ifdef HTP_HAS_HMX
    if (ctx->hmx_queue) {
        hmx_queue_delete(ctx->hmx_queue);
        ctx->hmx_queue = NULL;
    }
    ctx->hmx_enabled = false;
-#endif

    vtcm_free(ctx);

@ -500,6 +496,36 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
    return AEE_SUCCESS;
 }

+// Report the DSP hardware resources to the host.
+//
+// Outputs:
+//   n_threads - number of worker threads; currently forced equal to *n_hvx
+//   n_hvx     - HVX unit count taken from qurt_hvx_get_units(), clamped to the maximum number
+//               of hardware threads and to HTP_MAX_NTHREADS
+//   n_hmx     - always reported as 1
+//   vtcm_size - total VTCM size reported by HAP_compute_res_query_VTCM(), or an 8MB fallback
+//               when the query fails or reports zero
+//
+// Returns AEE_EBADPARM if any output pointer is NULL, AEE_SUCCESS otherwise. `handle` is unused.
+AEEResult htp_iface_hwinfo(remote_handle64 handle, uint32_t * n_threads, uint32_t * n_hvx, uint32_t * n_hmx, uint64_t * vtcm_size) {
+    (void)handle;
+    if (!n_threads || !n_hvx || !n_hmx || !vtcm_size) {
+        return AEE_EBADPARM;
+    }
+
+    qurt_sysenv_max_hthreads_t hw_threads;
+    qurt_sysenv_get_max_hw_threads(&hw_threads);
+    uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
+
+    uint32_t n_hvx_val = hw_nhvx;
+    if (n_hvx_val > hw_threads.max_hthreads) {
+        n_hvx_val = hw_threads.max_hthreads;
+    }
+    if (n_hvx_val > HTP_MAX_NTHREADS) {
+        n_hvx_val = HTP_MAX_NTHREADS;
+    }
+
+    // for now we force n_threads == n_hvx
+    *n_threads = n_hvx_val;
+    *n_hvx     = n_hvx_val;
+    *n_hmx     = 1;
+
+    // Best-effort query: only trust the reported size when the call succeeds and is non-zero,
+    // so a failed query can never replace the fallback with a partial or zero value.
+    unsigned int vtcm_sz = 0;
+    if (HAP_compute_res_query_VTCM(0, &vtcm_sz, NULL, NULL, NULL) != 0 || vtcm_sz == 0) {
+        vtcm_sz = 8 * 1024 * 1024; // 8MB default fallback
+    }
+    *vtcm_size = vtcm_sz;
+
+    return AEE_SUCCESS;
+}
+
 static void htp_error_callback(dspqueue_t queue, int error, void * context) {
    // No errors expected on the DSP.
    FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
@ -554,6 +580,12 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_MUL_MAT_ID:
            return op_matmul_id(octx);

+        case HTP_OP_MUL_MAT_QKV:
+            return op_matmul_qkv(octx);
+
+        case HTP_OP_MUL_MAT_FFN:
+            return op_matmul_ffn(octx);
+
        case HTP_OP_MUL:
        case HTP_OP_ADD:
        case HTP_OP_SUB:
@ -762,8 +794,9 @@ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, str
    }
 }

-static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
+static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
    memcpy(octx->op_params, op->params, sizeof(octx->op_params));
+    memcpy(octx->kernel_params, op->kernel_params, sizeof(octx->kernel_params));
    octx->flags = op->flags;
    octx->op    = op->opcode;

@ -785,22 +818,41 @@ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens,
            src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
    }

-    // Prep output tensor
-    struct htp_tensor *dst = tens + op->dst;
+    // Prep output tensors
+    for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
+        uint16_t dst_idx = op->dst[i];
+        if (dst_idx == 0xffff) {
+            octx->dsts[i] = NULL;
+            continue;
+        }
+        struct htp_tensor *dst = tens + dst_idx;
+        octx->dsts[i] = dst;

-    octx->dst = dst;
+        FARF(HIGH, "prep-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, dst_idx, (void*) dst->data, dst->size,
+            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+    }

-    FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
-        dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
+    int status = execute_op(octx);

-    (void) execute_op(octx);
+    octx->src0_spad.src = NULL;
+    octx->src1_spad.src = NULL;
+    octx->src2_spad.src = NULL;
+    octx->src3_spad.src = NULL;
+    octx->dst_spad.src  = NULL;

    // flush buffers on output
-    hex_l2flush((void *) dst->data, dst->size);
-    dst->flags |= HTP_TENSOR_FLUSHED;
+    for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
+        if (octx->dsts[i]) {
+            struct htp_tensor *dst = (struct htp_tensor *)octx->dsts[i];
+            hex_l2flush((void *) dst->data, dst->size);
+            dst->flags |= HTP_TENSOR_FLUSHED;

-    FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
-        dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
+            FARF(HIGH, "post-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, op->dst[i], (void*) dst->data, dst->size,
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+        }
+    }
+
+    return status;
 }

 #define DSPQUEUE_POLL_TIMEOUT_USEC 100
@ -892,20 +944,26 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            }
        }

+        int      op_status = HTP_STATUS_OK;
+        uint32_t op_wakeup = n_ops / 2; // half-way through the batch
+
        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;

-            if (i == (n_ops-1)) {
-                // wake up the host before starting the last op
+            if (i == op_wakeup) {
                dspqueue_write_early_wakeup_noblock(queue, 0, 0);
            }

            profile_start(ctx->profiler, &prof);

-            proc_op_req(octx, tens, i, &ops[i]);
+            op_status = proc_op_req(octx, tens, i, &ops[i]);

            profile_stop(ctx->profiler, &prof);

+            if (op_status != HTP_STATUS_OK) {
+                break;
+            }
+
            if (ctx->profiler) {
                pds[i].opcode = ops[i].opcode;
                pds[i].usecs  = prof.usecs;
@ -919,7 +977,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {

        struct htp_opbatch_rsp rsp;
        rsp.id        = req.id;
-        rsp.status    = HTP_STATUS_OK;
+        rsp.status    = op_status;
        rsp.n_bufs    = n_bufs;
        rsp.n_tensors = n_tens;
        rsp.n_ops     = n_ops;
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.h
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.h
@ -0,0 +1,508 @@
+#ifndef HTP_MATMUL_OPS_H
+#define HTP_MATMUL_OPS_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "htp-ops.h"
+#include "hex-fastdiv.h"
+#include "hex-common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --- HMX Tile Constraints ---
+#define HTP_MM_HMX_TILE_N_COLS 32
+#define HTP_MM_HMX_TILE_N_ROWS 32
+#define HTP_MM_HMX_TILE_SIZE   (32 * 32 * sizeof(__fp16)) // 2048 bytes
+#define HTP_MM_HMX_TILE_N_ELMS 1024
+#define HTP_MM_HMX_MIN_NROWS   4
+
+// --- Weight Repacked Tile Sizes ---
+#define HTP_MM_WEIGHT_TILE_SIZE_Q4_0   576
+#define HTP_MM_WEIGHT_TILE_SIZE_Q4_1   640
+#define HTP_MM_WEIGHT_TILE_SIZE_Q8_0   1088
+#define HTP_MM_WEIGHT_TILE_SIZE_IQ4_NL 576
+#define HTP_MM_WEIGHT_TILE_SIZE_MXFP4  544
+
+// --- Weight Repacked Aligned Tile Sizes ---
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0   640
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1   640
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0   1152
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_IQ4_NL 640
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4  640
+
+// --- Activation Tiled Block Sizes (including padding) ---
+#define HTP_MM_ACT_TILE_SIZE_Q8_0      1152
+#define HTP_MM_ACT_TILE_SIZE_Q8_1      1280
+
+#define HTP_MM_MAX_PREFETCH 16
+
+// --- Solver Cost Model Penalty Weights (HMX-specific) ---
+#define HTP_MM_HMX_COST_W_DEQUANT 3 // cost penalty for quantized weight loading/dequantization
+#define HTP_MM_HMX_COST_A_CONVERT 2 // cost penalty for activation loading/conversion
+
+// --- DMA Activation Transfer Configuration ---
+#define HTP_MM_DMA_ACT_ROWS_PER_STEP 2
+#define HTP_MM_DMA_ACT_MULTIPLIER    4
+
+// Identifies the matmul implementation chosen for an op.
+// The value is carried in htp_mm_kernel_params.kernel_type (stored there as int32_t).
+// HTP_MM_KERNEL_UNSUPPORTED is 0, so a zero-initialized params struct selects no kernel.
+enum htp_mm_kernel_type {
+    HTP_MM_KERNEL_UNSUPPORTED = 0,
+
+    // HMX paths
+    HTP_MM_KERNEL_HMX_2D,
+    HTP_MM_KERNEL_HMX_F16_BATCHED,
+
+    // HVX floating-point paths
+    HTP_MM_KERNEL_HVX_F16_F16_VTCM,
+    HTP_MM_KERNEL_HVX_F16_F16_DDR,
+    HTP_MM_KERNEL_HVX_F16_F32_DDR,
+
+    HTP_MM_KERNEL_HVX_F32_F32_VTCM,
+    HTP_MM_KERNEL_HVX_F32_F32_DDR,
+    HTP_MM_KERNEL_HVX_F32_F16_DDR,
+
+    // HVX quantized paths
+    HTP_MM_KERNEL_HVX_QUANT_ROW,      // standard row-wise parallel quantization
+    HTP_MM_KERNEL_HVX_QUANT_BLOCK,    // parallel block-wise quantization
+    HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT, // row-wise fallback flat quantization
+};
+
+// Op-specific struct for precomputed matmul params.
+// Holds the selected kernel (kernel_type), the chunking and threading choices, the VTCM
+// scratchpad sizes and precomputed fastdiv values. All scalar fields are int32_t.
+// NOTE(review): presumably filled on the host and read back from the op's kernel_params
+// blob on the DSP - confirm against the producer side.
+struct htp_mm_kernel_params {
+    int32_t  kernel_type;        // enum htp_mm_kernel_type
+    int32_t  pipeline;           // 1 = pipelined execution, 0 = standard
+    int32_t  m_chunk;            // Row chunk size (M chunk)
+    int32_t  n_chunk;            // Col chunk size (N chunk)
+    int32_t  n_threads;          // Number of threads to spawn
+    int32_t  n_act_threads;      // Number of threads for activation preparation
+    int32_t  n_hmx;              // 1 = use HMX, 0 = use HVX
+    int32_t  n_prefetch;         // Prefetch lookahead buffers/rows in VTCM
+    int32_t  tile_size;          // Weight tile size
+    int32_t  aligned_tile_size;  // Aligned weight tile size (padded to 128)
+    int32_t  src1_row_size;      // Row size for quantized activation
+    int32_t  vtcm_size;          // Total required scratchpad size in VTCM
+    int32_t  vtcm_src0_size;     // src0 scratchpad size in VTCM
+    int32_t  vtcm_src1_size;     // src1 scratchpad size in VTCM
+    int32_t  vtcm_src2_size;     // src2 scratchpad size in VTCM (fused only)
+    int32_t  vtcm_src3_size;     // src3 scratchpad size in VTCM (fused only)
+    int32_t  vtcm_dst_size;      // dst scratchpad size in VTCM
+
+    // Precomputed division values
+    struct fastdiv_values div_ne12_ne1;
+    struct fastdiv_values div_ne1;
+    struct fastdiv_values div_r2;
+    struct fastdiv_values div_r3;
+    struct fastdiv_values div_ne11;
+};
+
+// Compile-time guard: struct htp_mm_kernel_params must not exceed 128 bytes.
+// The C++ and C spellings of the assertion are selected by __cplusplus.
+#if defined(__cplusplus)
+static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_mm_kernel_params is too large for kernel_params blob");
+#else
+_Static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_mm_kernel_params is too large for kernel_params blob");
+#endif
+
+// Pair of 32-bit indices (i1, i2).
+// NOTE(review): presumably maps a gathered MUL_MAT_ID row back to its source position -
+// confirm against the users in matmul-ops.c.
+struct mmid_row_mapping {
+    uint32_t i1;
+    uint32_t i2;
+};
+
+// Search for optimal (mc, nc) chunk sizes within VTCM budget.
+//
+// Footprint model used by the search:
+//   overhead + nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost <= vtcm_total
+//
+// nc is tried from the largest multiple of HTP_MM_HMX_TILE_N_COLS not exceeding n down to
+// HTP_MM_HMX_TILE_N_COLS. For each nc the largest fitting mc is taken, aligned down to
+// HTP_MM_HMX_TILE_N_ROWS and then capped at m. Candidates are ranked by
+//   ceil(m / mc) * m_block_cost + ceil(n / nc) * n_block_cost
+// (lower is better); ties are broken in favour of the larger mc * nc.
+//
+// Outputs (written only on success):
+//   m_chunk_out, n_chunk_out - the chosen chunk sizes
+//   total_out                - exact footprint of the chosen pair, including overhead
+//
+// Returns 0 on success. Returns -1 when m or n is 0, vtcm_total <= overhead, any per_* cost
+// is 0, no candidate fits (this includes n < HTP_MM_HMX_TILE_N_COLS, where the loop body
+// never runs), or the final footprint computation overflows.
+static inline int htp_mm_hmx_compute_chunks(size_t   vtcm_total,
+                              size_t   overhead,
+                              size_t   per_n_cost,
+                              size_t   per_m_cost,
+                              size_t   per_mn_cost,
+                              size_t   m,
+                              size_t   n,
+                              size_t   m_block_cost,
+                              size_t   n_block_cost,
+                              size_t * m_chunk_out,
+                              size_t * n_chunk_out,
+                              size_t * total_out) {
+    if (m == 0 || n == 0) return -1;
+    if (vtcm_total <= overhead) return -1;
+    if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
+
+    const size_t usable = vtcm_total - overhead;
+
+    size_t best_cost = SIZE_MAX;
+    size_t best_mn   = 0;
+    size_t best_m = 0, best_n = 0;
+
+    const size_t n_max = hex_align_down((size_t)n, HTP_MM_HMX_TILE_N_COLS);
+    for (size_t nc = n_max; nc >= HTP_MM_HMX_TILE_N_COLS; nc -= HTP_MM_HMX_TILE_N_COLS) {
+        size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
+        // This `continue` bypasses the next_nc guard; that is still safe because stepping
+        // down from HTP_MM_HMX_TILE_N_COLS yields 0, which fails the loop condition.
+        if (hex_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
+        if (n_fixed >= usable) goto next_nc;
+
+        if (hex_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc;
+        if (hex_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc;
+
+        {
+            size_t remain = usable - n_fixed;
+            size_t mc = remain / mc_denom;
+            mc = hex_align_down(mc, HTP_MM_HMX_TILE_N_ROWS);
+            mc = hex_smin(mc, m);
+
+            if (mc == 0) {
+                goto next_nc;
+            }
+
+            // NOTE(review): unlike the footprint math, this cost arithmetic is not overflow-checked.
+            size_t mblocks = ((size_t) m + mc - 1) / mc;
+            size_t nblocks = ((size_t) n + nc - 1) / nc;
+            size_t cost    = mblocks * m_block_cost + nblocks * n_block_cost;
+            size_t mn      = mc * nc;
+            if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
+                best_cost = cost;
+                best_mn   = mn;
+                best_m    = mc;
+                best_n    = nc;
+            }
+        }
+
+next_nc:
+        if (nc == HTP_MM_HMX_TILE_N_COLS) break;  // avoid size_t underflow
+    }
+
+    if (best_m == 0 || best_n == 0) return -1;
+
+    // Compute exact total (with overflow checks)
+    size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0;
+    if (hex_mul_overflow(best_n, per_n_cost, &t0)) return -1;
+    if (hex_mul_overflow(best_m, per_m_cost, &t1)) return -1;
+    if (hex_mul_overflow(best_m, best_n, &mn))     return -1;
+    if (hex_mul_overflow(mn, per_mn_cost, &t2))    return -1;
+    if (hex_add_overflow(t0, t1, &total))          return -1;
+    if (hex_add_overflow(total, t2, &total))       return -1;
+    if (hex_add_overflow(total, overhead, &total)) return -1;
+
+    *m_chunk_out = best_m;
+    *n_chunk_out = best_n;
+    *total_out   = total;
+    return 0;
+}
+
+// --- Tile Size Helpers ---
+// Returns the repacked weight tile size (HTP_MM_WEIGHT_TILE_SIZE_*) for `weight_type`.
+// HTP_TYPE_IQ4_NL shares the Q4_0 value. Returns 0 for any type not listed in the switch.
+static inline uint32_t htp_mm_get_weight_tile_size(int weight_type) {
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:
+        case HTP_TYPE_IQ4_NL:
+            return HTP_MM_WEIGHT_TILE_SIZE_Q4_0;
+        case HTP_TYPE_Q4_1:
+            return HTP_MM_WEIGHT_TILE_SIZE_Q4_1;
+        case HTP_TYPE_Q8_0:
+            return HTP_MM_WEIGHT_TILE_SIZE_Q8_0;
+        case HTP_TYPE_MXFP4:
+            return HTP_MM_WEIGHT_TILE_SIZE_MXFP4;
+        default:
+            return 0;
+    }
+}
+
+// Returns the aligned repacked weight tile size (HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_*) for
+// `weight_type`. HTP_TYPE_IQ4_NL shares the Q4_0 value. Returns 0 for any type not listed.
+static inline uint32_t htp_mm_get_weight_aligned_tile_size(int weight_type) {
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:
+        case HTP_TYPE_IQ4_NL:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0;
+        case HTP_TYPE_Q4_1:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1;
+        case HTP_TYPE_Q8_0:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0;
+        case HTP_TYPE_MXFP4:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4;
+        default:
+            return 0;
+    }
+}
+
+// --- Activation/Row Size Helpers ---
+// Size of one tiled Q8_0 activation row of `ne` elements: ne is padded up to a multiple of
+// 128, split into 32-element blocks, and each block counts HTP_MM_ACT_TILE_SIZE_Q8_0.
+static inline size_t htp_mm_q8_0_tiled_row_size(uint32_t ne) {
+    const uint32_t ne_padded = ((ne + 127) / 128) * 128;
+    const uint32_t nb_32 = ne_padded / 32;
+    return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_0;
+}
+
+// Size of one tiled Q8_1 activation row of `ne` elements: ne is padded up to a multiple of
+// 128, split into 32-element blocks, and each block counts HTP_MM_ACT_TILE_SIZE_Q8_1.
+static inline size_t htp_mm_q8_1_tiled_row_size(uint32_t ne) {
+    const uint32_t ne_padded = ((ne + 127) / 128) * 128;
+    const uint32_t nb_32 = ne_padded / 32;
+    return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_1;
+}
+
+// Size of one flat Q8_0 activation row of `ne` elements: a quants area of ne aligned up to
+// 128, followed by a scales area of (one scale per 32 elements) * 2, also aligned up to 128.
+static inline size_t htp_mm_q8_0_flat_row_size(uint32_t ne) {
+    const uint32_t quants_size = hex_align_up(ne, 128);
+    const uint32_t num_scales = (ne + 31) / 32;
+    const uint32_t scales_size = hex_align_up(num_scales * 2, 128);
+    return quants_size + scales_size;
+}
+
+// Size of one flat Q8_1 activation row of `ne` elements: a quants area of ne aligned up to
+// 128, followed by a scales area of (one scale per 32 elements) * 4, also aligned up to 128.
+static inline size_t htp_mm_q8_1_flat_row_size(uint32_t ne) {
+    const uint32_t quants_size = hex_align_up(ne, 128);
+    const uint32_t num_scales = (ne + 31) / 32;
+    const uint32_t scales_size = hex_align_up(num_scales * 4, 128);
+    return quants_size + scales_size;
+}
+
+// Row stride of a weight row with `k` elements for the given weight type.
+//   quantized types (Q4_0, IQ4_NL, Q4_1, Q8_0, MXFP4): ceil(k / QK_Q4_0_TILED) * weight tile size
+//   F16: k * sizeof(__fp16)      F32: k * sizeof(float)
+// Returns 0 for any other type. The block count uses QK_Q4_0_TILED for every quantized type.
+static inline size_t htp_mm_get_tiled_row_stride(int weight_type, uint32_t k) {
+    uint32_t nb = (k + QK_Q4_0_TILED - 1) / QK_Q4_0_TILED;
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:
+        case HTP_TYPE_IQ4_NL:
+        case HTP_TYPE_Q4_1:
+        case HTP_TYPE_Q8_0:
+        case HTP_TYPE_MXFP4:
+            return (size_t) nb * htp_mm_get_weight_tile_size(weight_type);
+        case HTP_TYPE_F16:
+            return (size_t) k * sizeof(__fp16);
+        case HTP_TYPE_F32:
+            return (size_t) k * sizeof(float);
+        default:
+            return 0;
+    }
+}
+
+// Rounds n up to the next multiple of m (m need not be a power of two).
+// m must be non-zero: the expression divides by m.
+static inline size_t htp_mm_round_up(size_t n, size_t m) {
+    return ((n + m - 1) / m) * m;
+}
+
+// Returns true when m is greater than 32, i.e. when the pipelined HMX variant is selected.
+static inline bool htp_mm_hmx_pipeline(uint32_t m) {
+    return m > 32;
+}
+
+// Computes the per-n, per-m and per-(m*n) size coefficients for the 2D HMX kernel.
+//
+//   size_per_n_out  - weight row cost plus one vec_dot row (k * sizeof(uint16_t)); both terms
+//                     are doubled when `pipeline` is set. Quantized types use
+//                     (k / HTP_MM_HMX_TILE_N_COLS) * aligned_tile_size / 32 as the weight row
+//                     cost, F16/F32 use htp_mm_get_tiled_row_stride().
+//   size_per_m_out  - one vec_dot row, k * sizeof(uint16_t)
+//   size_per_mn_out - sizeof(uint16_t), doubled when `pipeline` is set
+//
+// NOTE(review): presumably these feed the per_n_cost / per_m_cost / per_mn_cost arguments of
+// htp_mm_hmx_compute_chunks - confirm at the call site.
+static inline void htp_mm_hmx_get_2d_chunk_costs(
+    int wtype, uint32_t k, bool pipeline, uint32_t aligned_tile_size,
+    size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
+) {
+    const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
+    const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
+    const size_t vec_dot_size = k * sizeof(uint16_t);
+    const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
+    const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0;
+
+    *size_per_n_out = (pipeline ? 2 : 1) * (is_quant ? qweight_row_stride : row_stride) +
+                      (pipeline ? 2 * vec_dot_size : vec_dot_size);
+    *size_per_m_out = vec_dot_size;
+    *size_per_mn_out = (pipeline ? 2 : 1) * sizeof(uint16_t);
+}
+
+// Computes the per-n, per-m and per-(m*n) size coefficients for the batched HMX kernel,
+// with vec_dot_size = k * sizeof(uint16_t):
+//   size_per_n_out  = 3 * vec_dot_size
+//   size_per_m_out  = group_size * vec_dot_size
+//   size_per_mn_out = sizeof(uint16_t)
+static inline void htp_mm_hmx_get_batched_chunk_costs(
+    uint32_t k, uint32_t group_size,
+    size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
+) {
+    const size_t vec_dot_size = k * sizeof(uint16_t);
+    *size_per_n_out = 3 * vec_dot_size;
+    *size_per_m_out = group_size * vec_dot_size;
+    *size_per_mn_out = sizeof(uint16_t);
+}
+
+// Total VTCM size needed by the 2D HMX kernel for chunk sizes (mc, nc).
+//
+// The result is the sum of the following areas, each rounded up to HTP_MM_HMX_TILE_SIZE,
+// plus a fixed 256:
+//   weight area  - quantized: (nc / 32) * (k / HTP_MM_HMX_TILE_N_COLS) * aligned_tile_size,
+//                  F16/F32: nc * row stride; doubled when `pipeline` is set
+//   act area     - mc * k * sizeof(uint16_t)
+//   act f32 area - act_threads * 4 * k * sizeof(float)
+//   output area  - mc * nc * sizeof(uint16_t)
+//   scratch0     - nc * k * sizeof(uint16_t)
+//   scratch1/2   - only when `pipeline` is set: a copy of scratch0 and of the output area
+static inline size_t htp_mm_hmx_get_2d_vtcm_size(
+    int wtype, uint32_t k, size_t mc, size_t nc, bool pipeline, uint32_t act_threads, uint32_t aligned_tile_size
+) {
+    const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
+    const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
+    const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
+    const size_t vec_dot_size = k * sizeof(uint16_t);
+
+    const size_t act_f32_size = htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE);
+    size_t weight_area_size = is_quant
+        ? htp_mm_round_up((nc / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE)
+        : htp_mm_round_up(nc * row_stride, HTP_MM_HMX_TILE_SIZE);
+    if (pipeline) {
+        weight_area_size *= 2;
+    }
+    const size_t act_area_size    = htp_mm_round_up(mc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+    const size_t output_area_size = htp_mm_round_up(mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
+
+    size_t scratch0_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+    size_t scratch1_size = pipeline ? scratch0_size : 0;
+    size_t scratch2_size = pipeline ? output_area_size : 0;
+
+    return weight_area_size + act_area_size + act_f32_size + output_area_size +
+           scratch0_size + scratch1_size + scratch2_size + 256;
+}
+
+// Total VTCM size needed by the batched HMX kernel for chunk sizes (mc, nc).
+//
+// The result is the sum of the following areas, each rounded up to HTP_MM_HMX_TILE_SIZE,
+// plus a fixed 256:
+//   weight area - nc * k * sizeof(uint16_t)
+//   act area    - group_size * mc * k * sizeof(uint16_t)
+//   output area - group_size * mc * nc * sizeof(uint16_t)
+//   scratch     - two areas of nc * k * sizeof(uint16_t)
+//   f32 scratch - act_threads * 4 * k * sizeof(float), only when use_dma_activation is set
+//
+// `wtype` and `pipeline` are accepted but not used.
+static inline size_t htp_mm_hmx_get_batched_vtcm_size(
+    int wtype, uint32_t k, size_t mc, size_t nc, uint32_t group_size, bool use_dma_activation, bool pipeline, uint32_t act_threads) {
+    (void)wtype;
+    (void)pipeline;
+    const size_t vec_dot_size     = k * sizeof(uint16_t);
+    const size_t f32_scratch_size = use_dma_activation
+        ? htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0;
+
+    const size_t act_head_stride   = mc * k;
+    const size_t weight_area_size  = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+    const size_t act_area_size     = htp_mm_round_up(group_size * act_head_stride * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
+    const size_t output_area_size  = htp_mm_round_up(group_size * mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
+    const size_t scratch_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+
+    return weight_area_size + act_area_size + output_area_size +
+           2 * scratch_area_size + 256 + f32_scratch_size;
+}
+
+// Computes the VTCM scratchpad sizes for the HVX matmul kernels.
+//
+// Writes the src0, src1 and dst scratchpad sizes to the three output pointers and returns
+// their sum. For a kernel_type not handled by the switch all three sizes are 0.
+//
+// Notes on the inputs:
+//   - A dst scratchpad is only reserved when src1_nrows <= 1 (dst_nrows is 0 otherwise).
+//   - The src1_row_size argument is only consulted by the *_DDR kernels; the VTCM and
+//     quantized kernels derive the src1 row size from ne10 and wtype instead.
+//   - For the quantized kernels with a repacked weight type (Q4_0, Q4_1, Q8_0, IQ4_NL, MXFP4)
+//     the src0 size computed from src0_row_size is replaced by one based on the aligned tile
+//     size.
+static inline size_t htp_mm_hvx_get_vtcm_sizes(
+    int kernel_type,
+    int wtype,
+    uint32_t ne10,       // k
+    uint32_t src1_nrows, // m_total (or act_nrows)
+    uint32_t n_threads,
+    size_t dst_row_size,
+    size_t src0_row_size,
+    size_t src1_row_size,
+    uint32_t n_prefetch,
+    size_t * vtcm_src0_size_out,
+    size_t * vtcm_src1_size_out,
+    size_t * vtcm_dst_size_out
+) {
+    size_t vtcm_src0_size = 0;
+    size_t vtcm_src1_size = 0;
+    size_t vtcm_dst_size  = 0;
+
+    const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
+                            wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
+                            wtype == HTP_TYPE_MXFP4);
+
+    const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
+    const size_t dst_nrows = (src1_nrows > 1) ? 0 : 1;
+
+    switch (kernel_type) {
+        // src1 is held whole (all rows, f16 row size); src0 is a per-thread prefetch window.
+        case HTP_MM_KERNEL_HVX_F16_F16_VTCM: {
+            size_t f16_src1_row_size = htp_mm_round_up(ne10 * 2, 128);
+            vtcm_src1_size = htp_mm_round_up(f16_src1_row_size * src1_nrows, 256);
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
+            break;
+        }
+        // Both src0 and src1 are per-thread prefetch windows sized from the caller's row sizes.
+        case HTP_MM_KERNEL_HVX_F16_F32_DDR:
+        case HTP_MM_KERNEL_HVX_F16_F16_DDR:
+        case HTP_MM_KERNEL_HVX_F32_F32_DDR:
+        case HTP_MM_KERNEL_HVX_F32_F16_DDR: {
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size, 256) * n_threads;
+            vtcm_src1_size = htp_mm_round_up(n_prefetch * src1_row_size, 256) * n_threads;
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
+            break;
+        }
+        // Same layout as the F16 VTCM case, with an f32 src1 row size.
+        case HTP_MM_KERNEL_HVX_F32_F32_VTCM: {
+            size_t f32_src1_row_size = htp_mm_round_up(ne10 * 4, 128);
+            vtcm_src1_size = htp_mm_round_up(f32_src1_row_size * src1_nrows, 256);
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
+            break;
+        }
+        // Quantized kernels with tiled activations: Q4_1 weights use the Q8_1 row size,
+        // every other type the Q8_0 row size.
+        case HTP_MM_KERNEL_HVX_QUANT_BLOCK:
+        case HTP_MM_KERNEL_HVX_QUANT_ROW: {
+            size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10);
+
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
+            vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
+
+            // src0 spad is also used in dynamic quantizer to store padded src1 rows
+            size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, QK_Q8_0_TILED * sizeof(float));
+            if (vtcm_src0_size < src1_row_size_padded) {
+                vtcm_src0_size = src1_row_size_padded;
+            }
+
+            vtcm_src0_size = vtcm_src0_size * n_threads;
+            vtcm_dst_size  = vtcm_dst_size * n_threads;
+
+            // Repacked weights: recompute the per-thread src0 size from the aligned tile row.
+            if (is_repack) {
+                uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
+                uint32_t n_k_tiles = ne10 / 32;
+                uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
+                size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
+                if (repacked_vtcm_size < src1_row_size_padded) {
+                    repacked_vtcm_size = src1_row_size_padded;
+                }
+                vtcm_src0_size = repacked_vtcm_size * n_threads;
+            }
+            break;
+        }
+        // Same as the tiled quantized case, but with flat activation rows padded to 256.
+        case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: {
+            size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10);
+
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
+            vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
+
+            size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, 256);
+            if (vtcm_src0_size < src1_row_size_padded) {
+                vtcm_src0_size = src1_row_size_padded;
+            }
+
+            vtcm_src0_size = vtcm_src0_size * n_threads;
+            vtcm_dst_size  = vtcm_dst_size * n_threads;
+
+            if (is_repack) {
+                uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
+                uint32_t n_k_tiles = ne10 / 32;
+                uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
+                size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
+                if (repacked_vtcm_size < src1_row_size_padded) {
+                    repacked_vtcm_size = src1_row_size_padded;
+                }
+                vtcm_src0_size = repacked_vtcm_size * n_threads;
+            }
+            break;
+        }
+        default:
+            break;
+    }
+
+    *vtcm_src0_size_out = vtcm_src0_size;
+    *vtcm_src1_size_out = vtcm_src1_size;
+    *vtcm_dst_size_out  = vtcm_dst_size;
+
+    return vtcm_src0_size + vtcm_src1_size + vtcm_dst_size;
+}
+
+// Computes the src0 and src1 VTCM scratchpad sizes for the HVX matmul-id path.
+//
+// src1 always uses the tiled quantized row size (Q8_1 for Q4_1 weights, Q8_0 otherwise) for
+// src1_nrows rows. src0 is a per-thread prefetch window of n_prefetch rows, never smaller
+// than one padded src1 row, multiplied by n_threads. For repacked weight types the
+// per-thread src0 size is recomputed from the aligned tile size.
+//
+// Writes both sizes to the output pointers and returns their sum.
+static inline size_t htp_mm_hvx_id_get_vtcm_sizes(
+    int wtype,
+    uint32_t ne10,                // k
+    uint32_t src1_nrows,
+    uint32_t n_threads,
+    size_t src0_row_size,    // nb01
+    uint32_t n_prefetch,
+    size_t * vtcm_src0_size_out,
+    size_t * vtcm_src1_size_out
+) {
+    const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
+                            wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
+                            wtype == HTP_TYPE_MXFP4);
+
+    const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
+    const size_t src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10)
+                                                          : htp_mm_q8_0_tiled_row_size(ne10);
+
+    size_t src0_sz_per_thread = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
+    size_t src1_sz            = htp_mm_round_up(src1_row_size * src1_nrows, 256);
+
+    // src0 spad also holds temporary transposed src1 columns during dynamic quantization.
+    const size_t src1_row_size_padded = htp_mm_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float));
+    if (src0_sz_per_thread < src1_row_size_padded) {
+        src0_sz_per_thread = src1_row_size_padded;
+    }
+
+    // Repacked weights: the per-thread size above is discarded in favour of the tile-based one.
+    if (is_repack) {
+        const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
+        const uint32_t n_k_tiles    = ne10 / 32;
+        const uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
+        size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
+        if (repacked_vtcm_size < src1_row_size_padded) {
+            repacked_vtcm_size = src1_row_size_padded;
+        }
+        src0_sz_per_thread = repacked_vtcm_size;
+    }
+
+    const size_t vtcm_src0_size = src0_sz_per_thread * n_threads;
+
+    *vtcm_src0_size_out = vtcm_src0_size;
+    *vtcm_src1_size_out = src1_sz;
+
+    return vtcm_src0_size + src1_sz;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HTP_MATMUL_OPS_H
--- a/ggml/src/ggml-hexagon/htp/ssm-conv.c
+++ b/ggml/src/ggml-hexagon/htp/ssm-conv.c
@ -183,24 +183,25 @@ static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) {
 // transposed into VTCM.
 //
 // VTCM layouts (per thread):
-//   src1_T : {d_inner_per_thread, d_conv}   — staged once per launch (small).
-//   src0_T : {d_inner_tile,     ncs}        — staged per d_inner-tile.
+//   src1_T : {d_inner_stride, d_conv}       - staged once per launch (small).
+//   src0_T : {d_inner_tile,     ncs}        - staged per d_inner-tile.
 //
 // d_inner_tile is chosen so that per-thread VTCM stays under the budget.
 // Each thread iterates ceil(d_inner_per_thread / d_inner_tile) tiles serially.
 #define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread

-// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM)
+// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_stride, d_conv} (VTCM)
 static inline void transpose_src1(const float * src1_data,
                                  uint32_t      src1_stride_inner,
                                  uint32_t      i1_off,
                                  uint32_t      d_inner_per_thread,
+                                  uint32_t      d_inner_stride,
                                  uint32_t      d_conv,
                                  float *       src1_T) {
    for (uint32_t i = 0; i < d_inner_per_thread; ++i) {
        const float * src_row = src1_data + (i1_off + i) * src1_stride_inner;
        for (uint32_t j = 0; j < d_conv; ++j) {
-            src1_T[j * d_inner_per_thread + i] = src_row[j];
+            src1_T[j * d_inner_stride + i] = src_row[j];
        }
    }
 }
@ -280,6 +281,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
    }

    const uint32_t d_inner_per_thread = ir1 - ir0;
+    const uint32_t d_inner_stride     = scctx->nrows_per_thread;
    const uint32_t d_inner_tile       = scctx->d_inner_tile;

    const float * src0_data = (const float *) src0->data;
@ -290,8 +292,8 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
    float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread);
    float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread);

-    // Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout.
-    transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T);
+    // Stage src1 weights once into VTCM in {d_inner_stride, d_conv} layout.
+    transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_inner_stride, d_conv, src1_T);

    const uint32_t C_TILE = VLEN_FP32;

@ -314,7 +316,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
                    HVX_Vector acc = hvx_vec_splat_f32(0.0f);
                    for (uint32_t j = 0; j < d_conv; ++j) {
                        HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb);
-                        HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb);
+                        HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_stride + tile_off + cb);
                        acc          = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w));
                    }
                    HVX_Vector res = Q6_Vsf_equals_Vqf32(acc);
@ -362,8 +364,7 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
            use_hvx = 1;
        }

-        scctx.nrows_per_thread  = (d_inner + n_threads - 1) / n_threads;
-        scctx.nrows_per_thread += (scctx.nrows_per_thread & 1);
+        scctx.nrows_per_thread = hex_round_up((d_inner + n_threads - 1) / n_threads, VLEN_FP32);

        const uint32_t d_inner_per_thread = scctx.nrows_per_thread;
        const uint32_t ncs                = src0->ne[0];
--- a/ggml/src/ggml-hexagon/libggml-htp.inf
+++ b/ggml/src/ggml-hexagon/libggml-htp.inf
@ -14,8 +14,6 @@ Drivers_Dir = 13
 1 = %DiskId%

 [SourceDisksFiles]
-libggml-htp-v68.so = 1
-libggml-htp-v69.so = 1
 libggml-htp-v73.so = 1
 libggml-htp-v75.so = 1
 libggml-htp-v79.so = 1
@ -28,8 +26,6 @@ ExcludeFromSelect = *
 CopyFiles=Drivers_Dir

 [Drivers_Dir]
-libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
-libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@ -10152,14 +10152,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

-    const int ne00 = src0 ? src0->ne[0] : 0;
-    const int ne01 = src0 ? src0->ne[1] : 0;
-    const int ne02 = src0 ? src0->ne[2] : 0;
-    const int ne03 = src0 ? src0->ne[3] : 0;
-
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    GGML_TENSOR_LOCALS(int,      ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);

    const int nth = MIN(64, ne00);

@ -10173,11 +10167,12 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),       &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),       &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),       &ne03));
-    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &nb01));
-    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &nb02));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &nb03));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),     &eps));
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),  &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float),     &eps));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float)*nth, NULL));

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};
--- a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl
@ -174,7 +174,7 @@ __kernel void kernel_gemv_noshuffle_q8_0_f32(
        regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
        regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;

-        dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB);
+        dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, convert_float(regS), regB);
    }

    // reduction in local memory, assumes #wave=4
--- a/Show More
+++ b/Show More