openvino: Update to OV 2026.2.1, self-contained release packages, operator improvements (#24974)

* Update to OV 2026.2.1, Make OV release packages self-contained

* Update to OV 2026.2.1, Make OV release packages self-contained

* OpenVINO Backend: Remove compute_op_type hardcoded sets (#222)

* OpenVINO Backend: Remove compute_op_type hardcoded sets

* revert get_op_type removal

* OpenVINO backend: enable softmax with sink input

* OpenVINO backend: opt mul_mat_id convert process for large size

* OpenVINO backend: Modify add_id to support 2D/4D

* OpenVINO Backend: Add glu_swiglu_oai

* PR review: fix paths

* PR review: fix path consistency

---------

Co-authored-by: Mostafa <mostafas.main.email@gmail.com>
Co-authored-by: Xuejun <Xuejun.Zhai@intel.com>
This commit is contained in:
Ravi Panchumarthy 2026-06-26 05:07:19 -07:00 committed by GitHub
parent e7ea94afcb
commit 5397c36194
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 316 additions and 111 deletions

View File

@ -1,12 +1,12 @@
ARG OPENVINO_VERSION_MAJOR=2026.2
ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
ARG OPENVINO_VERSION_MAJOR=2026.2.1
ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
ARG UBUNTU_VERSION=24.04
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.34.4
ARG IGC_VERSION_FULL=2_2.34.4+21428
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
ARG IGC_VERSION=v2.36.3
ARG IGC_VERSION_FULL=2_2.36.3+21719
ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
ARG IGDGMM_VERSION=22.10.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases

View File

@ -68,8 +68,8 @@ jobs:
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone
@ -96,8 +96,8 @@ jobs:
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone

View File

@ -39,8 +39,8 @@ jobs:
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone
@ -96,8 +96,8 @@ jobs:
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone

View File

@ -266,8 +266,8 @@ jobs:
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone

View File

@ -446,8 +446,8 @@ jobs:
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Set OpenVINO version output
@ -506,8 +506,11 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
cmake --build build/ReleaseOV --config Release -j $(nproc)
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build/ReleaseOV --config Release --parallel
- name: ccache-clear
uses: ./.github/actions/ccache-clear
@ -521,8 +524,26 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/ReleaseOV/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
dest=./build/ReleaseOV/bin
OPENVINO_ROOT=./openvino_toolkit
ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
# Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
# load these siblings without setupvars.sh / LD_LIBRARY_PATH.
cp -P "$ov_lib"/libopenvino.so* \
"$ov_lib"/libopenvino_c.so* \
"$ov_lib"/libopenvino_*_plugin.so \
"$ov_lib"/libopenvino_intel_npu_compiler*.so \
"$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
"$dest"
cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
# OpenVINO licensing
cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
cp LICENSE "$dest"
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .
- name: Upload artifacts
uses: actions/upload-artifact@v6
@ -538,8 +559,8 @@ jobs:
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Set OpenVINO version output
@ -607,7 +628,9 @@ jobs:
-A x64 ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_OPENVINO=ON ^
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
-DLLAMA_BUILD_BORINGSSL=ON ^
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
${{ env.CMAKE_ARGS }}
cmake --build build\ReleaseOV --config Release -- /m
@ -624,8 +647,29 @@ jobs:
id: pack_artifacts
shell: powershell
run: |
Copy-Item LICENSE .\build\ReleaseOV\bin\
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
# Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
$OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
if (-not $OPENVINO_ROOT) {
Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
exit 1
}
$dest = ".\build\ReleaseOV\bin\Release"
$ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
Copy-Item -Path (Join-Path $ovBin '*.dll') -Destination $dest -Force
Copy-Item -Path (Join-Path $ovBin 'cache.json') -Destination $dest -Force
$tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
# OpenVINO licensing
$licensingDest = Join-Path $dest 'openvino-licensing'
New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
Copy-Item LICENSE $dest
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*
- name: Upload artifacts
uses: actions/upload-artifact@v6

View File

@ -237,8 +237,8 @@ chmod +x ubuntu-llamacpp-ov-install.sh
# ============================================
set -euo pipefail
OPENVINO_VERSION_MAJOR="2026.2"
OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
OPENVINO_VERSION_MAJOR="2026.2.1"
OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
@ -334,7 +334,7 @@ echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf"
```
> [!NOTE]
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
</details>
@ -364,8 +364,8 @@ REM ============================================
REM llama.cpp OpenVINO Build Script (Ninja)
REM ============================================
set "OPENVINO_VERSION_MAJOR=2026.2"
set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
set "OPENVINO_VERSION_MAJOR=2026.2.1"
set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3"
set "SCRIPT_DIR=%~dp0"
set "VCPKG_DIR=C:\vcpkg"
@ -547,7 +547,7 @@ endlocal
```
> [!NOTE]
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
</details>

View File

@ -1270,77 +1270,14 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
}
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
static const std::map<ggml_op, std::string> ops = {
{GGML_OP_NONE, "GGML_OP_NONE" },
{GGML_OP_ACC, "GGML_OP_ACC" },
{GGML_OP_ADD, "GGML_OP_ADD" },
{GGML_OP_ADD1, "GGML_OP_ADD1" },
{GGML_OP_ADD_ID, "GGML_OP_ADD_ID" },
{GGML_OP_CONCAT, "GGML_OP_CONCAT" },
{GGML_OP_CONT, "GGML_OP_CONT" },
{GGML_OP_DIV, "GGML_OP_DIV" },
{GGML_OP_DUP, "GGML_OP_DUP" },
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
{GGML_OP_MUL, "GGML_OP_MUL" },
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
{GGML_OP_MUL_MAT_ID, "GGML_OP_MUL_MAT_ID" },
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
{GGML_OP_NORM, "GGML_OP_NORM" },
{GGML_OP_ROPE, "GGML_OP_ROPE" },
{GGML_OP_SCALE, "GGML_OP_SCALE" },
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
{GGML_OP_SUM_ROWS, "GGML_OP_SUM_ROWS" },
{GGML_OP_SUB, "GGML_OP_SUB" },
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
{GGML_OP_VIEW, "GGML_OP_VIEW" },
{GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
{GGML_OP_CPY, "GGML_OP_CPY" },
{GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT" },
{GGML_OP_L2_NORM, "GGML_OP_L2_NORM" },
{GGML_OP_CLAMP, "GGML_OP_CLAMP" },
{GGML_OP_PAD, "GGML_OP_PAD" },
{GGML_OP_SSM_CONV, "GGML_OP_SSM_CONV" },
{GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
{GGML_OP_ARGSORT, "GGML_OP_ARGSORT" },
{GGML_OP_REPEAT, "GGML_OP_REPEAT" },
{GGML_OP_IM2COL, "GGML_OP_IM2COL" }
};
static const std::map<ggml_unary_op, std::string> unary_ops = {
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
{GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
{GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
{GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
{GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
{GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
{GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
{GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
{GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
{GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
{GGML_UNARY_OP_SOFTPLUS, "GGML_UNARY_OP_SOFTPLUS" },
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
};
static const std::map<ggml_glu_op, std::string> glu_ops = {
{GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
{GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
{GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
};
switch (node->op) {
case GGML_OP_UNARY:
return unary_ops.at(ggml_get_unary_op(node));
return std::string("GGML_UNARY_OP_") + ggml_unary_op_name(ggml_get_unary_op(node));
case GGML_OP_GLU:
return glu_ops.at(ggml_get_glu_op(node));
return std::string("GGML_GLU_OP_") + ggml_glu_op_name(ggml_get_glu_op(node));
default:
return ops.at(node->op);
return std::string("GGML_OP_") + ggml_op_name(node->op);
}
static const std::string unknown_op = "UNKNOWN_GGML_OP";
return unknown_op;
}
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {

View File

@ -17,6 +17,22 @@ namespace frontend {
namespace ggml {
namespace op {
static ov::Output<ov::Node> reshape_add_id_input_to_2d(const ov::Output<ov::Node> & input,
const ov::PartialShape & input_shape,
const std::vector<int> & dims) {
const auto actual_shape = input.get_partial_shape();
if (actual_shape.rank().is_static() && actual_shape.rank().get_length() == 2) {
return input;
}
if (input_shape.rank().is_static() && input_shape.rank().get_length() == 2) {
return input;
}
auto shape = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
return std::make_shared<ov::op::v1::Reshape>(input, get_dimensions(shape, dims), false);
}
OutputVector translate_add_id(const NodeContext & context) {
num_inputs_check(context, 3, 3);
@ -28,11 +44,9 @@ OutputVector translate_add_id(const NodeContext & context) {
// input: [1, n_token, n_used, n_embd]
// bias: [1, 1, n_expert, n_embd]
// ids: [1, 1, n_token, n_used]
auto bias_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(bias, ov::element::i64);
auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
bias = std::make_shared<ov::op::v1::Reshape>(bias, get_dimensions(bias_shape_4d, {2, 3}), false);
ids = std::make_shared<ov::op::v1::Reshape>(ids, get_dimensions(ids_shape_4d, {2, 3}), false);
// Model bias constants may already be stored as [n_expert, n_embd].
bias = reshape_add_id_input_to_2d(bias, context.get_input_shape(1), {2, 3});
ids = reshape_add_id_input_to_2d(ids, context.get_input_shape(2), {2, 3});
if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);

View File

@ -3,8 +3,11 @@
#include "../utils.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <openvino/core/node_output.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/clamp.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/sigmoid.hpp>
@ -15,7 +18,7 @@ namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_glu_swiglu(const NodeContext & context) {
static std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> get_glu_inputs(const NodeContext & context) {
num_inputs_check(context, 1, 2);
ov::Output<ov::Node> src0;
@ -52,6 +55,12 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
std::swap(src0, src1);
}
return {src0, src1};
}
OutputVector translate_glu_swiglu(const NodeContext & context) {
auto [src0, src1] = get_glu_inputs(context);
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src0);
auto silu = std::make_shared<ov::op::v1::Multiply>(src0, sigmoid);
auto res = std::make_shared<ov::op::v1::Multiply>(silu, src1);
@ -59,6 +68,27 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
return rename_outputs_with_suffix({res}, context.get_name());
}
OutputVector translate_glu_swiglu_oai(const NodeContext & context) {
auto [src0, src1] = get_glu_inputs(context);
const int32_t * params = context.get_output_op_params();
const float alpha = reinterpret_cast<const float *>(params)[2];
const float limit = reinterpret_cast<const float *>(params)[3];
auto gate = std::make_shared<ov::op::v0::Clamp>(src0, -std::numeric_limits<float>::infinity(), limit);
auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, {}, {alpha});
auto scaled_gate = std::make_shared<ov::op::v1::Multiply>(gate, alpha_const);
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(scaled_gate);
auto out_glu = std::make_shared<ov::op::v1::Multiply>(gate, sigmoid);
auto up = std::make_shared<ov::op::v0::Clamp>(src1, -limit, limit);
auto one = ov::op::v0::Constant::create(ov::element::f32, {}, {1.0f});
auto up_plus_one = std::make_shared<ov::op::v1::Add>(up, one);
auto res = std::make_shared<ov::op::v1::Multiply>(out_glu, up_plus_one);
return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend

View File

@ -2,23 +2,135 @@
#include "../op_table.h"
#include "../utils.h"
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <openvino/op/bitwise_and.hpp>
#include <openvino/op/bitwise_right_shift.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
namespace {
std::shared_ptr<ov::op::v0::Constant> const_i64(const std::vector<int64_t> & values) {
return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{values.size()}, values);
}
ov::Output<ov::Node> slice_axis(const ov::Output<ov::Node> & input, int64_t axis, int64_t begin, int64_t end) {
return std::make_shared<ov::op::v8::Slice>(input, const_i64({begin}), const_i64({end}), const_i64({1}),
const_i64({axis}));
}
ov::Output<ov::Node> translate_mul_mat_id_mxfp4_packed(const NodeContext & context,
ov::Output<ov::Node> expert_weights,
ov::Output<ov::Node> activations,
ov::Output<ov::Node> ids) {
auto packed_shape = expert_weights.get_partial_shape().to_shape();
FRONT_END_OP_CONVERSION_CHECK(packed_shape.size() == 5 && packed_shape[4] == 17,
"Expected packed MXFP4 expert weights with shape [1, n_expert, m, k_blocks, 17]");
const int64_t n_expert = static_cast<int64_t>(packed_shape[1]);
const int64_t rows = static_cast<int64_t>(packed_shape[2]);
const int64_t k_blocks = static_cast<int64_t>(packed_shape[3]);
const int64_t qk = 32;
const int64_t cols = k_blocks * qk;
auto packed_shape_4d = const_i64({n_expert, rows, k_blocks, 17});
expert_weights = std::make_shared<ov::op::v1::Reshape>(expert_weights, packed_shape_4d, false);
auto activations_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3});
auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3});
activations = std::make_shared<ov::op::v1::Reshape>(activations, activations_shape_3d, false);
ids = std::make_shared<ov::op::v1::Reshape>(ids, ids_shape_2d, false);
if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
}
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
static const std::vector<float> f4e2m1_lut = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f,
-0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
std::vector<float> e8m0_lut(256);
for (size_t i = 0; i < e8m0_lut.size(); ++i) {
uint32_t bits = static_cast<uint32_t>(i) << 23;
memcpy(&e8m0_lut[i], &bits, sizeof(float));
}
e8m0_lut[0] = std::numeric_limits<float>::min() / 2.0f;
e8m0_lut[255] = std::numeric_limits<float>::quiet_NaN();
auto f4_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{f4e2m1_lut.size()}, f4e2m1_lut);
auto scale_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{e8m0_lut.size()}, e8m0_lut);
auto selected_packed_weights = std::make_shared<ov::op::v8::Gather>(expert_weights, ids, gather_axis);
auto scale_byte = slice_axis(selected_packed_weights, 4, 0, 1);
auto qs = slice_axis(selected_packed_weights, 4, 1, 17);
auto low = std::make_shared<ov::op::v13::BitwiseAnd>(
qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {0x0F}), ov::op::AutoBroadcastType::NUMPY);
auto high_shift = std::make_shared<ov::op::v15::BitwiseRightShift>(
qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {4}), ov::op::AutoBroadcastType::NUMPY);
auto nibbles = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{low, high_shift}, 4);
auto nibble_indices = std::make_shared<ov::op::v0::Convert>(nibbles, ov::element::i32);
auto weights_f32 = std::make_shared<ov::op::v8::Gather>(f4_lut, nibble_indices, gather_axis);
auto scale_indices = std::make_shared<ov::op::v0::Convert>(scale_byte, ov::element::i32);
auto scales_f32 = std::make_shared<ov::op::v8::Gather>(scale_lut, scale_indices, gather_axis);
ov::Output<ov::Node> selected_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales_f32,
ov::op::AutoBroadcastType::NUMPY);
auto ids_shape = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
auto selected_weights_target_dims = std::make_shared<ov::op::v0::Concat>(
ov::OutputVector{get_dimensions(ids_shape, {0, 1}), const_i64({rows, cols})}, 0);
selected_weights = std::make_shared<ov::op::v1::Reshape>(selected_weights, selected_weights_target_dims, false);
auto activations_shape = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
ov::Output<ov::Node> acts_target_dims = std::make_shared<ov::op::v0::Concat>(
ov::OutputVector{
get_dimensions(activations_shape, {0}),
get_dimensions(ids_shape, {1}),
get_dimensions(activations_shape, {2}),
},
0);
ov::Output<ov::Node> acts_broadcasted =
std::make_shared<ov::op::v3::Broadcast>(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL);
auto activations_expanded = std::make_shared<ov::op::v0::Unsqueeze>(acts_broadcasted, const_i64({2}));
ov::Output<ov::Node> result =
std::make_shared<ov::op::v0::MatMul>(activations_expanded, selected_weights, false, true);
auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows});
auto result_target_dims = std::make_shared<ov::op::v0::Concat>(
ov::OutputVector{batch_dim, get_dimensions(ids_shape, {0, 1}), row_dim}, 0);
result = std::make_shared<ov::op::v1::Reshape>(result, result_target_dims, false);
const auto output_type = context.get_output_type();
if (result.get_element_type() != output_type) {
result = std::make_shared<ov::op::v0::Convert>(result, output_type);
}
return result;
}
} // namespace
OutputVector translate_mul_mat_id(const NodeContext & context) {
num_inputs_check(context, 3, 3);
@ -26,6 +138,12 @@ OutputVector translate_mul_mat_id(const NodeContext & context) {
auto activations = process_view_input_new(context, 1);
auto ids = process_view_input_new(context, 2);
if (expert_weights.get_element_type() == ov::element::u8 && expert_weights.get_partial_shape().rank().is_static() &&
expert_weights.get_partial_shape().rank().get_length() == 5) {
return rename_outputs_with_suffix({translate_mul_mat_id_mxfp4_packed(context, expert_weights, activations, ids)},
context.get_name());
}
// OpenVINO sees GGML tensors in reversed dimension order:
// weights: [1, n_expert, m, k]
// activations: [1, n_tokens, n_used_or_1, k]

View File

@ -6,12 +6,16 @@
#include <cstdint>
#include <cstring>
#include <memory>
#include <openvino/op/broadcast.hpp>
#include <openvino/frontend/exception.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/softmax.hpp>
#include <vector>
@ -20,12 +24,31 @@ namespace frontend {
namespace ggml {
namespace op {
static bool is_static_one(const ov::Dimension & dim) {
return dim.is_static() && dim.get_length() == 1;
}
static bool same_static_dim(const ov::Dimension & lhs, const ov::Dimension & rhs) {
return lhs.is_static() && rhs.is_static() && lhs.get_length() == rhs.get_length();
}
static bool is_attention_sinks_input_shape(const ov::PartialShape & candidate, const ov::PartialShape & logits_shape) {
if (candidate.rank().is_dynamic() || logits_shape.rank().is_dynamic() || candidate.rank().get_length() != 4 ||
logits_shape.rank().get_length() != 4) {
return false;
}
return is_static_one(candidate[0]) && is_static_one(candidate[1]) && is_static_one(candidate[2]) &&
same_static_dim(candidate[3], logits_shape[1]);
}
// Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend:
// 1) logits = src0 * scale
// 2) logits += mask (if provided)
// 3) softmax over the last dimension
// 3) append attention sinks as hidden logits (if provided)
// 4) softmax over the last dimension and remove the hidden sink column
OutputVector translate_soft_max(const NodeContext & context) {
num_inputs_check(context, 1, 2);
num_inputs_check(context, 1, 3);
float scale = 1.0f;
float max_bias = 0.0f;
@ -33,6 +56,11 @@ OutputVector translate_soft_max(const NodeContext & context) {
memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float));
ov::Output<ov::Node> logits = context.get_input(0);
const bool second_input_is_sinks =
context.get_input_size() == 2 && is_attention_sinks_input_shape(context.get_input_shape(1), context.get_output_shape());
const bool has_mask = context.get_input_size() > 1 && !second_input_is_sinks;
const bool has_sinks = second_input_is_sinks || context.get_input_size() > 2;
const size_t sinks_input_idx = second_input_is_sinks ? 1 : 2;
// Apply scale first: logits = src0 * scale
if (scale != 1.0f) {
@ -41,12 +69,12 @@ OutputVector translate_soft_max(const NodeContext & context) {
logits = std::make_shared<ov::op::v1::Multiply>(logits, scale_const);
}
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2),
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && !has_mask),
"OpenVINO softmax ALiBi path requires mask input");
// Optional mask add: logits += mask
// For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
if (context.get_input_size() > 1) {
if (has_mask) {
ov::Output<ov::Node> mask = context.get_input(1);
// For stateful
@ -94,8 +122,40 @@ OutputVector translate_soft_max(const NodeContext & context) {
logits = std::make_shared<ov::op::v1::Add>(logits, mask);
}
ov::Output<ov::Node> softmax_input = logits;
if (has_sinks) {
ov::Output<ov::Node> sinks = context.get_input(sinks_input_idx);
if (sinks.get_element_type() != logits.get_element_type()) {
sinks = std::make_shared<ov::op::v0::Convert>(sinks, logits.get_element_type());
}
auto sink_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, {1, -1, 1, 1});
auto sinks_4d = std::make_shared<ov::op::v1::Reshape>(sinks, sink_shape, false);
auto logits_shape = std::make_shared<ov::op::v3::ShapeOf>(logits, ov::element::i64);
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
auto four = ov::op::v0::Constant::create(ov::element::i64, {1}, {4});
auto shape_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto sink_prefix_shape = std::make_shared<ov::op::v8::Slice>(logits_shape, zero, three, one, shape_axis);
auto sink_last_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto sink_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
ov::OutputVector{sink_prefix_shape, sink_last_dim}, 0);
auto sink_column = std::make_shared<ov::op::v3::Broadcast>(sinks_4d, sink_broadcast_shape,
ov::op::BroadcastType::BIDIRECTIONAL);
softmax_input = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{logits, sink_column}, 3);
auto softmax_with_sink = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
auto original_last_dim = std::make_shared<ov::op::v8::Slice>(logits_shape, three, four, one, shape_axis);
auto res = std::make_shared<ov::op::v8::Slice>(softmax_with_sink, zero, original_last_dim, one, three);
return rename_outputs_with_suffix({res}, context.get_name());
}
// Softmax along last dimension (equivalent to ggml softmax over ne[0]).
auto res = std::make_shared<ov::op::v8::Softmax>(logits, -1);
auto res = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
return rename_outputs_with_suffix({res}, context.get_name());
}

View File

@ -47,6 +47,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
{"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input<v0::Tanh> },
{"GGML_OP_VIEW", op::translate_view },
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
{"GGML_GLU_OP_SWIGLU_OAI", op::translate_glu_swiglu_oai },
{"GGML_GLU_OP_GEGLU", op::translate_glu_geglu },
{"GGML_OP_SET_ROWS", op::translate_set_rows },
{"GGML_OP_CPY", op::translate_cpy },

View File

@ -32,6 +32,7 @@ GGML_OP_CONVERTER(translate_soft_max);
GGML_OP_CONVERTER(translate_transpose);
GGML_OP_CONVERTER(translate_view);
GGML_OP_CONVERTER(translate_glu_swiglu);
GGML_OP_CONVERTER(translate_glu_swiglu_oai);
GGML_OP_CONVERTER(translate_glu_geglu);
GGML_OP_CONVERTER(translate_set_rows);
GGML_OP_CONVERTER(translate_cpy);