mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
* Add interface is_model_splitted() to check the c-graph is splited or not * Infer and propagate dynamic-dimension indices for all tensors in the GGML graph in api compute_model_outputs() * Only do this for fallback sub graph * Move dynamic dims compute in graph missmatch * ggml-openvino: fix tensor data handling for PERMUTE/VIEW ops in split models * ggml-openvino:add comments * ggml-openvino: override VIEW op_case to 0 for split model inputs * openvino backend: Handle unsupported VIEW shape-mismatch in OpenVINO backend * Enable additional mul_mat tests and add tensor data saving function (#81) * ggml-openvino: fix CONT/TRANSPOSE mapping and improve dynamic-dimension handling * OpenVINO: add NORM/TANH support and rework SOFT_MAX translation * ggml-openvino: extend VIEW handling * Enable -fa off (#118) * Enable --context-shift * Fix llm param compute error for normal softmax not the softmax in attention * OpenVINO backend: fix error for attention size compute in llm param * use tensor->extra in infer_request i/o * OpenVINO backend: refacter the compute_llm_params() func add get_attention_pattern_case to easy extand * OpenVINO backend: clean unused code * 1to1 match op update (#146) * added translate_1to1_match_1_input function and updated gelu and tanh translations * Remove unused translation function calls --------- Co-authored-by: Mustafa Cavus <mustafacavus@intel.com> * initial gemma4 support * removed hardcoded names for kv cache slicing * OpenVINO backend: Add new attention pattern for llm parameters compute * flash attn Q shape static conversion * Remove slice in permute translation when n_seq is 1 * return optional in extract_layer_from_name * OpenVINO backend: refactor VIEW related operation (#148) * OpenVINO backend: refactor VIEW related operation * Enable VIEW handling in following ops * OpenVINO backend does not support GGML_OP_NORM & GGML_OP_L2_NORM with VIEW input accuracy issue from OpenVINO * OpenVINO backend: Add ops l2_norm & pad * OpenVINO 
backend does not support CPY with non-contiguous data or mismatched types * add op SSM_CONV GATED_DELTA_NET * OpenVINO backend: fix error for bf16 in OV gpu plugin * reverted static Q input shape for attention layer * OpenVINO backend: remove hardcode name inp_tokens, which ignore some leaf case * Disable remote tensor due to bug in ov gpu * Disable n_token > 1 GATED_DELTA_NET on gpu * OpenVINO backend: fix the view op dynamic handling issue in gemma4 & enable view + get_row * OpenVINO backend: clean code * OpenVINO backend: enable view + norm/rms_norm * OpenVINO backend: concat op * OpenVINO backend: argsort op * OpenVINO backend: enable unary + view & GGML_UNARY_OP_SOFTPLUS * Fix issue for test-backend-ops in TOPK_MOE, which compare VIEW ops result, VIEW node in OpenVINO no need compare, the whole graph result is correct * OpenVINO backend: enable sum_rows * OpenVINO backend: enable clamp * OpenVINO backend: enable DIV * OpenVINO backend: enable GGML_OP_MUL_MAT_ID * OpenVINO backend: disable MUL_MAT_ID_FUSION case with large mem needed * OpenVINO backend: Disable GGML_OP_ARGSORT, cause test_backend-ops failed * OpenVINO backend: fix issue in mul_mat_id * OpenVINO backend: Disable DIV with broadcast on GPU * OpenVINO backend: update DIV * use ov internal op GatedDeltaNet * OpenVINO backend: enable llama erch test qwen3next * OpenVINO backend: enable RMS_NORM + VIEW & remove op_case 2 for rope * OpenVINO backend: fix error * suggested changes, need review * suggested changes, need review * OpenVINO backend: clean unused code & fix build warning * OpenVINO backend: enable minicpm3 for arch test * Disable GDN op (#177) * disable gated_delta_net * update stateful_kv_size correctly in mismatch case * OpenVINO backend: enable arch test for qwen3vl * OpenVINO backend: enable cohere2 for arch test * OpenVINO backend: enable t5 for arch test * OpenVINO backend: enable jamba for arch test * OpenVINO backend: remove warning for tmp * OpenVINO backend: enable kimi-linear for 
arch test * Remove unused * Fix gpt-oss accuracy issue * OpenVINO backend: enable arctic for arch test * OpenVINO backend: enable grok for arch test * Gemma4 initial npu support (#179) * Initiall gemma4 npu support * temp. fix for gemma4 accuracy bug on npu * Remove hardcoded names for npu-fold handling * revert static n tokens for cont translation as it is not needed * removed unused variable * ggml-openvino: add GGML_OPENVINO_ENABLE_CACHE env var to control decoder cache. Add environment variable GGML_OPENVINO_ENABLE_CACHE (default: YES). When set to NO, the decoder_cache is bypassed and models are rebuilt from the cgraph on every inference call in both dynamic and static compute paths. This is useful for debugging and verifying correctness without caching interference. * Revert "Gemma4 initial npu support (#179)" This reverts commit 0d29a9c4a52dc2c8aa52990f1a3854cfb01768ad. * OpenVINO backend: disable debug log print * Update TBB discovery. Delegated to OpenVINOs own config. * OpenVINO backend: GGML_OPENVINO_ENABLE_CACHE YES -> 1 * OpenVINO backend: fallback FLASH_ATTN_EXT in gemma3n to CPU backend * Add raw ov infer profiling metric * Add OV raw infer time metric to static compute path Co-authored-by: virajwad <84867530+virajwad@users.noreply.github.com> * Modify precision of static profiling * update to OV 2026.2, add OV windows CI * fix editorconfig-checks * Initiall gemma4 npu support * temp. fix for gemma4 accuracy bug on npu * Remove hardcoded names for npu-fold handling * revert static n tokens for cont translation as it is not needed * removed unused variable * test-llama-archs fix * Fix gemma4 flash_attn fallback * support im2col * fix code style * disable add_rope_sin_cos optimization * stateless boradcast and rope optimizations * Enable manual gqa attn by default for stateless gpu * manual gqa: fixed static batch * gemma4 llama-bench ctx update fix * Update OV win CI * stateful rope fusion temp. 
fix * OpenVINO backend: Conslolidate supported ops * Exclude unsupported GGML_OP_SUB cases * Exclude unsupported TOPK_MOE cases * OpenVINO Backend: MUL_MAT enhancements * Update OV CI * support f16 mask input for npu * Make GGML_OPENVINO_* env vars usage uniform Standardize all GGML_OPENVINO_* env flags: positive integers >0 to enable. Unset, empty, =0, or non-numeric values to disable. This fixes cases where text values or empty strings enabled features. * OpenVINO backend: Enhance envvar handling * more cleanup * move ggml_openvino_env_flag to appropriate place * OpenVINO backend: add REPEAT translator, Q5_1 weights, and GLU view-input fix * ggml-openvino: fix -Werror=cast-qual in extract_q5_1_data * Update openvino.Dockerfile Use BuildKit cache mounts for faster Docker rebuilds. Use apt instead of dpkg, remove unused .ddeb downloads, add DLLAMA_BUILD_TESTS=OFF. * ggml-openvino: centralize env var access via *getenv_str/getenv_int helpers Replace getenv and legacy flags with _str and _int helpers.Minor cleanup, doc updates. 
* OpenVINO backend: Enable GGML_OP_ADD_ID * Uptade openvino backend clamg-format * clang-format * Update OPENVINO.md (#211) * OpenVINO backend: fix accuracy issue for op CONCAT with i64 precision * Remove strict concurrency for gpu-openvino-low-perf * Update openvino CI keynames; add ccache-clear * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <1629204+CISC@users.noreply.github.com> * Fix formatting --------- Co-authored-by: Xuejun Zhai <Xuejun.Zhai@intel.com> Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com> Co-authored-by: Mustafa Cavus <mustafacavus@intel.com> Co-authored-by: Xuejun <XuejunZhai@intel.com> Co-authored-by: Wang Yang <yang4.wang@intel.com> Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com> Co-authored-by: virajwad <84867530+virajwad@users.noreply.github.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Mostafa Faheem <mostafaaafaheem@gmail.com> Co-authored-by: Sigbjørn Skjæret <1629204+CISC@users.noreply.github.com>
1652 lines
69 KiB
C++
1652 lines
69 KiB
C++
#include "ggml-decoder.h"
|
|
|
|
#include "ggml-impl.h"
|
|
#include "ggml-openvino-extra.h"
|
|
#include "ggml-openvino.h"
|
|
#include "ggml-quants.h"
|
|
#include "ggml.h"
|
|
#include "utils.h"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <fstream>
|
|
#include <iomanip>
|
|
#include <map>
|
|
#include <memory>
|
|
#include <openvino/core/dimension.hpp>
|
|
#include <openvino/core/except.hpp>
|
|
#include <openvino/core/node.hpp>
|
|
#include <openvino/core/partial_shape.hpp>
|
|
#include <openvino/core/type/bfloat16.hpp>
|
|
#include <openvino/core/type/element_type.hpp>
|
|
#include <openvino/core/type/float16.hpp>
|
|
#include <openvino/op/constant.hpp>
|
|
#include <openvino/op/convert.hpp>
|
|
#include <openvino/op/parameter.hpp>
|
|
#include <openvino/runtime/tensor.hpp>
|
|
#include <ostream>
|
|
#include <set>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
// Full (non-naive) decoder constructor.
//
// Stores the graph, weights and the precomputed model/compute parameters, validates the
// configuration, then derives per-node bookkeeping in this order: node info list, dynamic
// dimension indices, model inputs, model outputs, per-node op case / op type, and finally
// the extra scalar inputs.
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                             ModelParams & model_params,
                             ComputeParams & compute_params,
                             std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                             bool is_static,
                             bool is_stateful,
                             bool model_is_splitted,
                             bool is_prefill,
                             int prefill_chunk_size) :
    m_is_static(is_static),
    m_is_stateful(is_stateful),
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
    m_model_is_splitted(model_is_splitted),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
    m_model_params(model_params),
    m_compute_params(compute_params) {
    // The tensor address map is dumped at most once, and only when the env flag is enabled.
    // The flag keeps being polled on later constructions until the dump has happened.
    static bool address_map_dumped = false;
    if (!address_map_dumped && ggml_openvino_getenv_int("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
        address_map_dumped = true;
        print_tensor_address_map(cgraph);
    }

    validate_cgraph();

    set_input_output();
    compute_node_dynamic_dims();
    compute_model_inputs();
    compute_model_outputs();

    // set_input_output() created one entry per graph node; classify each of them.
    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
        auto & info = m_node_info_list[idx];
        info.node_op_case = compute_op_case(info.node);
        info.node_op_type = compute_op_type(info.node);
    }

    add_extra_inputs();
}
|
|
|
|
// Points the decoder at a new cgraph and rebuilds the node info list and the model
// input/output maps from it.
// NOTE(review): unlike the constructors, node_op_case / node_op_type are not recomputed
// after the node info list is rebuilt (set_input_output() resets node_op_case to 0) —
// confirm that callers only rely on the refreshed I/O here.
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
    m_cgraph = cgraph;

    // Discard everything that was derived from the previous graph.
    m_node_info_list.clear();
    m_model_outputs.clear();
    m_model_inputs.clear();

    set_input_output();
    compute_model_inputs();
    compute_model_outputs();
}
|
|
|
|
// Naive decoder constructor: only the graph and the weights are provided, and the decoder
// is flagged as naive. Builds the node info list, the model inputs/outputs and the
// per-node op case / op type; no extra inputs are added in this mode.
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) :
    m_naive(true),
    m_cgraph(cgraph),
    m_model_weights(model_weights) {
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();

    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
        auto & info = m_node_info_list[idx];
        info.node_op_case = compute_op_case(info.node);
        info.node_op_type = compute_op_type(info.node);
    }
}
|
|
|
|
void GgmlOvDecoder::set_input_output() {
|
|
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
|
|
auto node = m_cgraph->nodes[node_n];
|
|
|
|
NodeInfo current_node_info;
|
|
auto node_name = std::string(node->name);
|
|
auto node_output_name = node_name;
|
|
auto * node_output = node;
|
|
if (node->op == GGML_OP_SET_ROWS) {
|
|
// SET_ROWS updates the tensor in place. For later ov op that uses the
|
|
// the view_src of SET_ROWS, we need to make sure they get the updated tensor
|
|
// by putting the view_src name in the tensor_map in
|
|
// <openvino>/src/frontends/ggml/src/translate_session.cpp
|
|
node_output_name = std::string(node->view_src->name);
|
|
node_output = node->view_src;
|
|
}
|
|
|
|
current_node_info.node = node;
|
|
current_node_info.node_name = node_name;
|
|
current_node_info.node_output = node_output;
|
|
current_node_info.node_output_name = node_output_name;
|
|
current_node_info.node_op_case = 0;
|
|
current_node_info.data_addr = node->data;
|
|
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
auto * src = node->src[i];
|
|
if (src == nullptr) {
|
|
continue;
|
|
}
|
|
auto src_name = std::string(src->name);
|
|
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
src_name = get_graph_input_ov_name(src, node);
|
|
}
|
|
current_node_info.node_inputs[src_name] = src;
|
|
current_node_info.node_inputs_names.push_back(src_name);
|
|
|
|
if (src->op == GGML_OP_VIEW) {
|
|
// Traverse upward through nested VIEW operations
|
|
std::remove_reference_t<decltype(current_node_info.node_inputs_views[src_name])> view_chain;
|
|
auto current = src;
|
|
|
|
while (current != nullptr) {
|
|
auto current_name = std::string(current->name);
|
|
if (current->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
current_name = get_graph_input_ov_name(current, node);
|
|
}
|
|
view_chain.emplace_back(current_name, current);
|
|
// If current src is also a VIEW, continue traversing
|
|
if (current->src[0] != nullptr && current->src[0]->op == GGML_OP_VIEW) {
|
|
current = current->src[0];
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Assign all collected view inputs to node_inputs_views
|
|
current_node_info.node_inputs_views[src_name] = view_chain;
|
|
}
|
|
}
|
|
|
|
m_node_info_list.push_back(current_node_info);
|
|
}
|
|
}
|
|
|
|
int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
|
|
int op_case = 0;
|
|
switch (node->op) {
|
|
case GGML_OP_RESHAPE: {
|
|
auto * src = node->src[0];
|
|
if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
|
|
op_case = 4;
|
|
} else if (node->ne[0] * node->ne[1] == src->ne[0]) {
|
|
op_case = 1;
|
|
} else if (src->ne[0] * src->ne[1] == node->ne[0]) {
|
|
op_case = 2;
|
|
if (src->ne[2] * src->ne[3] == node->ne[1]) {
|
|
op_case = 5;
|
|
}
|
|
} else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) {
|
|
op_case = 3;
|
|
} else if (src->ne[1] * src->ne[2] == node->ne[1]) {
|
|
op_case = 6;
|
|
}
|
|
if (op_case == 0 && ggml_nelements(node) == ggml_nelements(src)) {
|
|
op_case = 6;
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_PERMUTE: {
|
|
if (node->src[0]->op != GGML_OP_VIEW) {
|
|
op_case = 1;
|
|
} else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
|
|
// kv cache tensor
|
|
std::string src_name(node->view_src->name);
|
|
int layer = extract_layer_from_name(src_name).value();
|
|
if (ggml_is_contiguous(node->src[0])) {
|
|
// - 19: [ 64, 8, 256, 1] VIEW cache_k_l0 (view) [ 2, 128, 1024, 1048576]
|
|
// [ 512, 1024, 1, 1] 0: NONE cache_k_l0 [ 2, 1024, 1048576, 1048576]
|
|
// - 20: [ 64, 256, 8, 1] PERMUTE cache_k_l0 (view) (permuted) [ 2, 1024, 128, 1048576]
|
|
// [ 64, 8, 256, 1] 0: VIEW cache_k_l0 (view) [ 2, 128, 1024, 1048576]
|
|
if (!is_swa_layer(layer)) {
|
|
op_case = 3;
|
|
} else {
|
|
op_case = 4;
|
|
}
|
|
} else {
|
|
// special case of cache v when `-fa off`
|
|
// - 17: [ 256, 8, 64, 1] VIEW cache_v_l0 (view) [ 2, 131072, 2048, 1048576]
|
|
// [ 512, 1024, 1, 1] 0: NONE cache_v_l0 [ 2, 1024, 1048576, 1048576]
|
|
// - 18: [ 256, 64, 8, 1] PERMUTE cache_v_l0 (view) (permuted) [ 2, 2048, 131072, 1048576]
|
|
// [ 256, 8, 64, 1] 0: VIEW cache_v_l0 (view) [ 2, 131072, 2048, 1048576]
|
|
if (!is_swa_layer(layer)) {
|
|
op_case = 5;
|
|
} else {
|
|
op_case = 6;
|
|
}
|
|
}
|
|
} else {
|
|
// rope'ed query tensor
|
|
op_case = 2;
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_MUL_MAT: {
|
|
if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
|
|
op_case = 3;
|
|
} else if (node->src[1]->op == GGML_OP_SOFT_MAX) {
|
|
// In the case of `-fa off`, softmax is used, v_trans=true, the dynamic dim is ne[0] for cache_v
|
|
op_case = 2;
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_GET_ROWS: {
|
|
if (node->src[1]->op == GGML_OP_VIEW) {
|
|
op_case = 2;
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_ROPE: {
|
|
const int mode = node->op_params[2];
|
|
switch (mode) {
|
|
case GGML_ROPE_TYPE_NEOX: {
|
|
op_case = 1;
|
|
break;
|
|
}
|
|
case GGML_ROPE_TYPE_IMROPE: {
|
|
op_case = 2;
|
|
break;
|
|
}
|
|
default:
|
|
op_case = 0;
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_VIEW: {
|
|
if (node->src[0]->op == GGML_OP_VIEW) {
|
|
auto * src = node->src[0];
|
|
if (ggml_nelements(node) != ggml_nelements(src)) {
|
|
// throw std::runtime_error("Unsupported VIEW case");
|
|
}
|
|
op_case = 0;
|
|
if (m_model_is_splitted && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
|
|
op_case = 0;
|
|
}
|
|
}
|
|
{
|
|
auto * src = node->src[0];
|
|
if (ggml_nelements(node) != ggml_nelements(src)) {
|
|
// Case 4: select one slice on src dim1 (via view offset), keep src dim2 as output dim1.
|
|
// Typical pattern:
|
|
// src: ne=[N, M, K, 1], nb=[b0, b1, b2, b3]
|
|
// dst: ne=[N, K, 1, 1], nb=[b0, b2, b3, b3]
|
|
if (node->ne[0] == src->ne[0] && node->ne[1] == src->ne[2] && node->ne[2] == 1 &&
|
|
node->nb[0] == src->nb[0] && node->nb[1] == src->nb[2] && src->ne[1] > 1) {
|
|
op_case = 0;
|
|
break;
|
|
}
|
|
|
|
// General case 3: shape differs from source (one or more dims) and is handled as VIEW slicing.
|
|
int diff_count = 0;
|
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
if (node->ne[i] != src->ne[i]) {
|
|
diff_count++;
|
|
}
|
|
// if node ne[i] > src ne[i], case = 0
|
|
if (node->ne[i] > src->ne[i]) {
|
|
return 0;
|
|
}
|
|
}
|
|
if (diff_count >= 1) {
|
|
op_case = 0;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
return op_case;
|
|
}
|
|
|
|
// Extracts the layer index from a tensor name such as "cache_k_l12" or "cache_k_l12 (view)".
//
// The index is the integer that follows the first "_l" in `name`, up to the next space or
// the end of the string.
//
// Returns std::nullopt when the name has no "_l" marker, or when the text after the marker
// is not a number that fits in an int. No exception escapes this function.
std::optional<int> extract_layer_from_name(const std::string & name) {
    const size_t marker = name.find("_l");
    if (marker == std::string::npos) {
        return std::nullopt;
    }

    const size_t begin = marker + 2;
    size_t end = name.find(' ', begin);
    if (end == std::string::npos) {
        end = name.length();
    }

    // std::stoi throws on empty / non-numeric / oversized input; the optional return type
    // is the contract for "no layer here", so translate those failures into nullopt.
    try {
        return std::stoi(name.substr(begin, end - begin));
    } catch (const std::invalid_argument &) {
        return std::nullopt;
    } catch (const std::out_of_range &) {
        return std::nullopt;
    }
}
|
|
|
|
// Scans the graph once and derives the model-level and per-compute LLM parameters.
//
// Parameters:
//   cgraph    - graph to inspect; an empty graph yields default-constructed parameters
//               (apart from the static output_len fix-up and the derived ctx).
//   is_static - when true, attention sizes are pinned to the per-sequence context sizes,
//               token_len_per_seq is forced to 1 and a zero output_len becomes 1.
//
// Returns {model_params, compute_params}.
//
// Throws std::bad_optional_access when a matched kv-cache tensor name has no layer index.
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
    ModelParams model_params;
    ComputeParams compute_params;

    // Recognises the attention sub-graph rooted at `node`. Returns the pattern id, or -1:
    //   0: FLASH_ATTN_EXT whose src[1] is PERMUTE(VIEW(..))
    //   1: FLASH_ATTN_EXT whose src[1] is CPY(PERMUTE(VIEW(..)))
    //   2: SOFT_MAX (with a src[1]) over MUL_MAT(PERMUTE(VIEW(..)), ..)
    //   3: SOFT_MAX over ADD(MUL_MAT(PERMUTE(..), ..), ..)
    auto get_attention_pattern_case = [](const ggml_tensor * node) -> int {
        if (node == nullptr) {
            return -1;
        }

        switch (node->op) {
            case GGML_OP_FLASH_ATTN_EXT: {
                if (node->src[0] == nullptr || node->src[1] == nullptr || node->src[3] == nullptr) {
                    return -1;
                }
                const ggml_tensor * k = node->src[1];
                // case 0: K is a PERMUTE of a VIEW (the view of cache k)
                if (k->op == GGML_OP_PERMUTE && k->src[0] != nullptr && k->src[0]->op == GGML_OP_VIEW) {
                    return 0;
                }
                // case 1: K is a CPY of a PERMUTE of a VIEW (the view of cache k)
                if (k->op == GGML_OP_CPY && k->src[0] != nullptr && k->src[0]->op == GGML_OP_PERMUTE &&
                    k->src[0]->src[0] != nullptr && k->src[0]->src[0]->op == GGML_OP_VIEW) {
                    return 1;
                }
                break;
            }
            case GGML_OP_SOFT_MAX: {
                const ggml_tensor * scores = node->src[0];
                if (scores == nullptr) {
                    // Both SOFT_MAX patterns need src[0]; guard once instead of per pattern.
                    break;
                }
                // case 2: SOFT_MAX(MUL_MAT(PERMUTE(VIEW(cache k)), q), mask)
                if (node->src[1] != nullptr && scores->op == GGML_OP_MUL_MAT && scores->src[0] != nullptr &&
                    scores->src[1] != nullptr && scores->src[0]->op == GGML_OP_PERMUTE &&
                    scores->src[0]->src[0] != nullptr && scores->src[0]->src[0]->op == GGML_OP_VIEW) {
                    return 2;
                }
                // case 3: SOFT_MAX(ADD(MUL_MAT(PERMUTE(..), ..), ..), ..)
                if (scores->op == GGML_OP_ADD && scores->src[0] != nullptr &&
                    scores->src[0]->op == GGML_OP_MUL_MAT && scores->src[0]->src[0] != nullptr &&
                    scores->src[0]->src[0]->op == GGML_OP_PERMUTE) {
                    return 3;
                }
                break;
            }
            default:
                break;
        }

        return -1;
    };

    bool rope_seen = false;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        auto * node = cgraph->nodes[i];
        const int attention_pattern_case = get_attention_pattern_case(node);
        if (attention_pattern_case != -1) {
            ggml_tensor * cache_k_permute = nullptr;
            ggml_tensor * mask = nullptr;

            // Locate the PERMUTE over the K cache view and the mask for the matched pattern.
            switch (attention_pattern_case) {
                case 0:
                    cache_k_permute = node->src[1];
                    mask = node->src[3];
                    break;
                case 1:
                    cache_k_permute = node->src[1]->src[0];
                    mask = node->src[3];
                    break;
                case 2:
                    cache_k_permute = node->src[0]->src[0];
                    mask = node->src[1];
                    break;
                case 3:
                    cache_k_permute = node->src[0]->src[0]->src[0];
                    mask = node->src[1];
                    break;
                default:
                    break;
            }

            assert(cache_k_permute != nullptr);

            model_params.head_size = cache_k_permute->ne[0];
            model_params.n_heads_kv = cache_k_permute->ne[2];
            compute_params.input_len = node->src[0]->ne[1];
            compute_params.token_len_per_seq = node->src[0]->ne[1];

            // Pattern 3 does not verify what the PERMUTE reads from, and may come without a
            // mask; in either case there is no kv-cache information to extract from this node.
            auto * cache_k_view = cache_k_permute->src[0];
            if (cache_k_view == nullptr || cache_k_view->op != GGML_OP_VIEW || mask == nullptr) {
                continue;
            }

            ggml_tensor * cache_k = cache_k_view->src[0];
            int layer = extract_layer_from_name(cache_k->name).value();

            const std::string mask_name(mask->name);
            const bool is_swa_mask = mask_name.find("swa") != std::string::npos;

            model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
            if (is_swa_mask) {
                model_params.swa_layers.push_back(layer);
                model_params.ctx_per_seq_swa = cache_k->ne[1];
                compute_params.attention_size_swa = mask->ne[0];
            } else {
                model_params.ctx_per_seq = cache_k->ne[1];
                model_params.n_seq = cache_k->ne[2];
                compute_params.attention_size = mask->ne[0];
            }

            compute_params.n_seq_active = mask->ne[3];
            // The view's byte offset (first op_param) divided by the byte size of one
            // sequence's cache gives the index of the first active sequence.
            auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
            size_t offset;
            memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
            compute_params.seq_active_start = offset / seq_size;

            if (is_static) {
                compute_params.attention_size = model_params.ctx_per_seq;
                compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
                compute_params.token_len_per_seq = 1;
            }
        }

        if (node->op == GGML_OP_MUL_MAT && node->src[0]->op == GGML_OP_PERMUTE &&
            node->src[0]->src[0]->op == GGML_OP_VIEW && is_kvcache(node->src[0]->view_src, node->view_src)) {
            if (node->src[1]->op == GGML_OP_PERMUTE && node->src[1]->src[0]->op == GGML_OP_VIEW &&
                node->src[1]->src[0]->src[0]->op == GGML_OP_ROPE) {
                compute_params.attention_size = node->ne[0];
            }
        }

        // if the node op is TRANSPOSE and its input is PERMUTE and the source of the PERMUTE is VIEW, then get the
        // attention size with the TRANSPOSE node ne[0] (in case no GGML_OP_FLASH_ATTN_EXT)
        if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE &&
            node->src[0]->src[0]->op == GGML_OP_VIEW) {
            compute_params.attention_size = node->ne[0];
            if (is_static) {
                compute_params.attention_size = model_params.ctx_per_seq;
            }
        }

        if (node->op == GGML_OP_ROPE) {
            if (compute_params.token_len_per_seq == -1 && node->src[1] != nullptr) {
                compute_params.token_len_per_seq = ggml_nelements(node->src[1]);
            }

            // When multiple ROPE ops in the graph disagree on op_params (e.g. gemma4's
            // mixed SWA/non-SWA layers with different n_dims or freq_base), we cannot
            // share a single precomputed rope_sin/rope_cos. Track divergence so the
            // translator falls back to per-op make_sin_cos in that case.
            static_assert(sizeof(model_params.rope_params) == sizeof(int32_t) * 15, "rope_params size");
            if (!rope_seen) {
                memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
                rope_seen = true;
            } else if (memcmp(model_params.rope_params, node->op_params, sizeof(int32_t) * 15) != 0) {
                model_params.mixed_rope_params = true;
            }
        }
    }

    // The last graph node is taken as the output tensor; an empty graph has none.
    if (cgraph->n_nodes > 0) {
        auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
        compute_params.output_len = output_tensor->ne[1];
    }
    // for NPU, output_len is always 1 except for llama-perplexity
    if (is_static && compute_params.output_len == 0) {
        compute_params.output_len = 1;
    }
    model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
    return {model_params, compute_params};
}
|
|
|
|
void GgmlOvDecoder::validate_cgraph() const {
|
|
if (m_model_params.n_seq > 1 && m_is_static == true) {
|
|
throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
|
|
}
|
|
}
|
|
|
|
// Computes the (possibly dynamic) OpenVINO shape of a graph input.
//
// Parameters:
//   op                - the node that consumes the input (or the input node itself when
//                       `input` is null).
//   input             - the input tensor; may be nullptr, in which case the shape of `op`
//                       is used, in both naive and non-naive mode.
//   dynamic_dim_index - ggml dimension index to mark dynamic when the model is split, or -1
//                       for none. NOTE(review): the index is mapped as `3 - index`, which
//                       assumes a rank-4 shape — confirm for all callers.
//
// In naive mode the static tensor shape is returned unchanged. Otherwise the shape depends
// on the role of the input (tokens/positions, output index, mask, kv cache, kv update
// index, anything else), with -1 marking dynamic dimensions.
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op,
                                                      const ggml_tensor * input,
                                                      int dynamic_dim_index) const {
    if (m_naive) {
        return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
    }

    ov::PartialShape input_shape;

    if (input == nullptr) {
        // compute_model_inputs() passes a null input for leaf nodes that are themselves
        // the input; mirror the naive path and describe `op` instead of dereferencing null.
        input_shape = ov::PartialShape{get_shape(op)};

    } else if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
        // tokens or positions
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};

    } else if (is_output_idx(input, op)) {
        // output index
        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

    } else if (is_inp_mask(input, op)) {
        // mask
        if (m_is_static) {
            input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
        } else if (m_is_stateful) {
            input_shape = ov::PartialShape{1, 1, -1, -1};
        } else {
            input_shape = ov::PartialShape{-1, 1, -1, -1};
        }

    } else if (is_kvcache(input, op)) {
        // kvcache
        input_shape = ov::PartialShape{get_shape(input)};
        if (!m_is_static) {
            // do not fix ctx size to make llama-bench work across test params
            input_shape[2] = -1;
        }
        if (is_stateful()) {
            // Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
            // to stateful layout [1, seq, n_heads_kv, head_size].
            assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
                   input_shape[2].is_dynamic() &&
                   input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
            input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
                           m_model_params.head_size};
        }

    } else if (is_kv_idx(input, op)) {
        // kv update index
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};

    } else {
        input_shape = ov::PartialShape{get_shape(input)};
    }

    // For split models the inferred dynamic dimension of the tensor is opened up.
    if (dynamic_dim_index != -1 && m_model_is_splitted) {
        input_shape[3 - dynamic_dim_index] = -1;
    }

    if (op->op == GGML_OP_SOFT_MAX && op->src[1] != nullptr && op->src[1]->op == GGML_OP_NONE &&
        (op->src[1]->flags & GGML_TENSOR_FLAG_INPUT) && op->src[1] == input) {
        // for softmax input mask, the shape is [1, 1, seq_active, seq_active], where seq_active is determined by
        // the input active sequence length instead of the kv cache sequence length
        input_shape[2] = -1;
        input_shape[3] = -1;
    }
    return input_shape;
}
|
|
|
|
void GgmlOvDecoder::add_extra_inputs() {
|
|
// Extra inputs:
|
|
// 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
|
|
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
|
|
// 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
|
|
|
|
auto create_1d_input = [this](const std::string & name, int64_t value) {
|
|
if (m_is_static) {
|
|
auto constant =
|
|
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
|
|
constant->set_friendly_name(name);
|
|
m_model_extra_inputs[name] = constant;
|
|
} else {
|
|
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
|
|
param_node->set_friendly_name(name);
|
|
param_node->output(0).get_tensor().set_names({name});
|
|
m_model_extra_inputs[name] = param_node;
|
|
|
|
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
|
|
*tensor->data<int64_t>() = value;
|
|
m_model_extra_input_values[name] = tensor;
|
|
}
|
|
};
|
|
|
|
if (m_compute_params.attention_size != -1) {
|
|
create_1d_input("attention_size", m_compute_params.attention_size);
|
|
}
|
|
if (m_compute_params.attention_size_swa != -1) {
|
|
create_1d_input("attention_size_swa", m_compute_params.attention_size_swa);
|
|
}
|
|
create_1d_input("n_seq_active", m_compute_params.n_seq_active);
|
|
create_1d_input("seq_active_start", m_compute_params.seq_active_start);
|
|
create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active);
|
|
if (m_compute_params.token_len_per_seq != -1) {
|
|
create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
|
|
}
|
|
// create_1d_input("token_len", m_compute_params.token_len_per_seq * m_compute_params.n_seq_active);
|
|
}
|
|
|
|
bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
|
|
ggml_tensor * node = m_cgraph->nodes[node_idx];
|
|
for (int i = node_idx; i < m_cgraph->n_nodes; i++) {
|
|
ggml_tensor * other_node = m_cgraph->nodes[i];
|
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
if (other_node->src[j] == node) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void GgmlOvDecoder::compute_model_inputs() {
|
|
m_model_inputs.clear();
|
|
m_inputs.clear();
|
|
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = m_cgraph->nodes[i];
|
|
// the node op is NONE means this node maybe as input of later nodes, we should add it to model inputs for this node.
|
|
if (node->op == GGML_OP_NONE && node_is_used_as_src(i)) {
|
|
std::string node_name(node->name);
|
|
if (m_model_weights.find(node_name) == m_model_weights.end()) {
|
|
m_inputs[node_name] = node;
|
|
auto param_node = std::make_shared<ov::op::v0::Parameter>(
|
|
get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node]));
|
|
param_node->set_friendly_name(node_name);
|
|
param_node->output(0).get_tensor().set_names({node_name});
|
|
m_model_inputs[node_name] = param_node;
|
|
}
|
|
continue;
|
|
}
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
auto * src = node->src[i];
|
|
if (src == nullptr) {
|
|
continue;
|
|
}
|
|
std::string src_name = std::string(src->name);
|
|
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
src_name = get_graph_input_ov_name(src, node);
|
|
}
|
|
if (m_model_weights.find(src_name) != m_model_weights.end()) {
|
|
continue;
|
|
}
|
|
|
|
bool is_intermediate_node = false;
|
|
for (const auto & node_info : m_node_info_list) {
|
|
if (node_info.node == src) {
|
|
is_intermediate_node = true;
|
|
break;
|
|
}
|
|
}
|
|
if (is_intermediate_node) {
|
|
continue;
|
|
}
|
|
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
|
|
continue;
|
|
}
|
|
|
|
m_inputs[src_name] = src;
|
|
|
|
ggml_backend_buffer * buffer = src->buffer;
|
|
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
|
|
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
|
|
if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
|
|
it == m_model_params.kv_names.end()) {
|
|
m_model_params.kv_names.push_back(src_name);
|
|
}
|
|
}
|
|
// Resolve nested VIEW nodes by following src[0] until the first non-VIEW tensor.
|
|
while (src->op == GGML_OP_VIEW && src->src[0] != nullptr) {
|
|
src = src->src[0];
|
|
src_name = std::string(src->name);
|
|
}
|
|
m_inputs[src_name] = src;
|
|
ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]);
|
|
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
|
|
param_node->set_friendly_name(src_name);
|
|
param_node->output(0).get_tensor().set_names({src_name});
|
|
m_model_inputs[src_name] = param_node;
|
|
}
|
|
}
|
|
}
|
|
|
|
void GgmlOvDecoder::compute_model_outputs() {
|
|
m_model_outputs.clear();
|
|
m_model_output_names.clear();
|
|
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
|
|
auto * cur_node = m_cgraph->nodes[node_n];
|
|
// if the node op is NONE means this node is not used at all, we can skip it directly without adding to model outputs.
|
|
if (cur_node->op == GGML_OP_NONE || cur_node->op == GGML_OP_VIEW || cur_node->op == GGML_OP_RESHAPE) {
|
|
continue;
|
|
}
|
|
auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
|
|
if (cur_node_use_count == 0) {
|
|
// The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
|
|
if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
|
|
cur_node = cur_node->view_src;
|
|
}
|
|
} else {
|
|
int input_use_count = 0;
|
|
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = m_cgraph->nodes[i];
|
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
if (node->src[j] != NULL && node->src[j] == cur_node) {
|
|
input_use_count++;
|
|
}
|
|
}
|
|
}
|
|
if (input_use_count == cur_node_use_count) {
|
|
cur_node = nullptr;
|
|
}
|
|
}
|
|
if (cur_node != nullptr) {
|
|
std::string node_output_name(cur_node->name);
|
|
m_model_outputs[node_output_name] = cur_node;
|
|
m_model_output_names.push_back(node_output_name);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns the first graph node (in graph order) that lists `tensor` among its
// sources, or nullptr when `tensor` is null or no node reads it.
const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
    if (!tensor) {
        return nullptr;
    }

    for (int idx = 0; idx < m_cgraph->n_nodes; ++idx) {
        const ggml_tensor * candidate = m_cgraph->nodes[idx];
        const auto src_end = std::end(candidate->src);
        if (std::find(std::begin(candidate->src), src_end, tensor) != src_end) {
            return candidate;
        }
    }
    return nullptr;
}
|
|
|
|
// Looks up a tensor by name among the source operands of the graph nodes.
//
// Only node sources are searched (not the nodes themselves). Nodes are scanned in
// graph order and sources in slot order; the first source whose name equals `name`
// is returned, or nullptr when there is none.
const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        const auto * node = m_cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            const auto * src = node->src[j];
            if (src == nullptr) {
                // Source slots may have gaps (an optional operand left null with later
                // slots still populated), so skip the empty slot instead of abandoning
                // the rest of this node's sources. This matches the other src loops in
                // this file.
                continue;
            }
            // Compare against the C string directly; no temporary std::string needed.
            if (name == src->name) {
                return src;
            }
        }
    }
    return nullptr;
}
|
|
|
|
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
|
|
std::map<std::string, std::string> kv_param_res_names;
|
|
for (const auto & name : m_model_params.kv_names) {
|
|
kv_param_res_names[name] = name;
|
|
}
|
|
return kv_param_res_names;
|
|
}
|
|
|
|
// Builds one OpenVINO node per distinct weight tensor referenced by the graph.
//
// A source tensor is considered a weight when it is not a view (view_src is null)
// and either its buffer usage is GGML_BACKEND_BUFFER_USAGE_WEIGHTS or its type is
// quantized. The result is keyed by tensor name; tensors recognised by
// is_rope_freqs_weight() are keyed as "rope_freqs.weight". The first tensor seen for
// a given key wins.
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
    std::map<std::string, std::shared_ptr<ov::Node>> weights_by_name;

    const auto node_count = cgraph->n_nodes;
    for (int n = 0; n < node_count; ++n) {
        ggml_tensor * consumer = cgraph->nodes[n];

        for (ggml_tensor * operand : consumer->src) {
            if (operand == nullptr) {
                continue;
            }

            std::string key(operand->name);
            if (is_rope_freqs_weight(operand, consumer)) {
                key = "rope_freqs.weight";
            }

            if (operand->view_src) {
                continue;
            }

            ggml_backend_buffer * backing = operand->buffer;
            if (backing->usage != GGML_BACKEND_BUFFER_USAGE_WEIGHTS && !ggml_is_quantized(operand->type)) {
                continue;
            }

            if (weights_by_name.count(key) != 0) {
                continue;
            }

            auto constant = create_weight_node(operand, naive);
            constant->set_friendly_name(key);
            weights_by_name[key] = constant;
        }
    }
    return weights_by_name;
}
|
|
|
|
// Returns the OpenVINO node that represents the weight `tensor`.
//
// Steps, in order:
//   1. If tensor->extra already carries a weight node (plain or quantized), return it.
//   2. Non-quantized tensors with ne[2] > 1 or ne[3] > 1 become a Constant with the
//      full reversed 4D shape, wrapping tensor->data.
//   3. Otherwise the tensor type must be in the supported list (std::runtime_error if
//      not) and the data goes through process_weight_tensor().
//   4. For tensors in an OpenVINO buffer the result is attached to the tensor via
//      ggml_openvino_buffer_register_extra().
//
// `naive` is forwarded as the use_bias argument of process_weight_tensor() for
// quantized tensors.
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
    const bool in_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);

    // A pre-built constant may already be attached by the OpenVINO backend buffer
    // (set during ggml_backend_openvino_buffer_set_tensor).
    if (tensor->extra) {
        OPENVINO_ASSERT(in_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
                                          " Possibly this is a cpu backend repacked quantized weights");
        // The base type carries a tag that tells which concrete extra this is.
        auto * extra_header = static_cast<ggml_openvino_extra_base *>(tensor->extra);
        switch (extra_header->type) {
            case ggml_openvino_extra_base::Type::WEIGHT: {
                // F16/F32/BF16 weight with shared-memory constant
                auto * plain_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
                if (plain_extra->weight_node) {
                    return plain_extra->weight_node;
                }
                break;
            }
            case ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT: {
                // Quantized weight with pre-extracted data
                auto * quantized_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
                if (quantized_extra->weight_node) {
                    return quantized_extra->weight_node;
                }
                break;
            }
            default:
                break;
        }
    }

    // MUL_MAT_ID expert weights are 3D GGML tensors [k, m, n_expert]. Keep the full
    // reversed 4D shape when materializing non-quantized constants; otherwise the
    // expert dimension is collapsed and later Gather/MatMul logic only sees a single
    // expert slice.
    const bool has_outer_dims = tensor->ne[2] > 1 || tensor->ne[3] > 1;
    if (!ggml_is_quantized(tensor->type) && has_outer_dims) {
        auto wrapped = ov::Tensor(get_ov_type(tensor), get_shape(tensor), tensor->data);
        auto constant = std::make_shared<ov::op::v0::Constant>(wrapped);
        constant->set_friendly_name(tensor->name);
        return constant;
    }

    // There are three cases where a new weight node has to be created:
    // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not
    //    trigger backend_buffer_set_tensor
    // 2. weights are in cpu/cpu_mapped buffer. token_embd.weight goes to case 1 or 2,
    //    depending on whether mmap or direct_io is used
    // 3. test-backend-ops. Buffers in test-backend-ops do not set USAGE_WEIGHT, so
    //    backend_buffer_set_tensor will not create a weight node
    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_Q8_0,
                                                     GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1, GGML_TYPE_Q4_K,
                                                     GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
    if (weight_types.count(tensor->type) == 0) {
        throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                 ggml_type_name(tensor->type));
    }

    OvWeight ov_weight;
    if (!ggml_is_quantized(tensor->type)) {
        // Non-quantized weights (F16/F32/BF16): the data is already in tensor->data and
        // the same pointer is handed over as both source and destination.
        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
    } else {
        bool use_bias = naive;
        if (in_ov_buffer) {
            // process_weight_tensor reads the raw data and writes the extracted results
            // (weights/scales/zp) to the output pointer. Both would be tensor->data, so
            // the raw bytes are staged in a temporary copy first to avoid the overlap.
            const auto * raw_bytes = static_cast<const uint8_t *>(tensor->data);
            std::vector<uint8_t> staged(raw_bytes, raw_bytes + ggml_nbytes(tensor));
            ov_weight = process_weight_tensor(tensor, staged.data(), tensor->data, use_bias);
        } else {
            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
        }
    }

    ov_weight.weight_node->set_friendly_name(tensor->name);

    // Only tensors living in an OpenVINO buffer get the result attached as an extra.
    if (in_ov_buffer) {
        ggml_openvino_extra_base * extra = nullptr;
        if (ov_weight.is_quantized()) {
            extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
                                                             std::move(ov_weight.zp), ov_weight.weight_node);
        } else {
            extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
        }
        // NOTE(review): presumably the buffer takes ownership of `extra` here - confirm.
        ggml_openvino_buffer_register_extra(tensor, extra);
    }

    return ov_weight.weight_node;
}
|
|
|
|
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
|
|
std::ofstream file(filename);
|
|
if (!file.is_open()) {
|
|
std::cerr << "Failed to open file" << std::endl;
|
|
return;
|
|
}
|
|
|
|
file << "=== GRAPH ===\n";
|
|
|
|
// clang-format off
|
|
file << "n_nodes = " << cgraph->n_nodes << "\n";
|
|
file << " " << std::setw(3) << "nodes"
|
|
<< std::setw(15) << "shape"
|
|
<< std::setw(20) << "op"
|
|
<< std::setw(20) << "name"
|
|
<< std::setw(3) << " "
|
|
<< std::setw(62) << "stride"
|
|
<< std::setw(20) << "buffer_type"
|
|
<< "\n";
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
// Get buffer type name
|
|
const char * buf_name = "none";
|
|
ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
|
|
if (buf) {
|
|
buf_name = ggml_backend_buffer_name(buf);
|
|
}
|
|
|
|
file << " - " << std::setw(3) << i << ": [ "
|
|
<< std::setw(5) << node->ne[0] << ", "
|
|
<< std::setw(5) << node->ne[1] << ", "
|
|
<< std::setw(5) << node->ne[2] << ", "
|
|
<< std::setw(5) << node->ne[3] << "] "
|
|
<< std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
|
|
<< std::left << std::setw(45) << node->name << std::right
|
|
<< std::setw(2) << "[ "
|
|
<< std::setw(0) << node->nb[0] << ", "
|
|
<< std::setw(5) << node->nb[1] << ", "
|
|
<< std::setw(5) << node->nb[2] << ", "
|
|
<< std::setw(5) << node->nb[3] << "] "
|
|
<< std::right << std::setw(15) << buf_name << std::right
|
|
<< "\n";
|
|
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
if (auto* src = node->src[i]) {
|
|
// Get buffer type name for source
|
|
const char * src_buf_name = "none";
|
|
ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
|
|
if (src_buf) {
|
|
src_buf_name = ggml_backend_buffer_name(src_buf);
|
|
}
|
|
|
|
file << std::setw(10) << " [ "
|
|
<< std::setw(5) << src->ne[0] << ", "
|
|
<< std::setw(5) << src->ne[1] << ", "
|
|
<< std::setw(5) << src->ne[2] << ", "
|
|
<< std::setw(5) << src->ne[3] << "] "
|
|
<< std::setw(12)
|
|
<< i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
|
|
file << std::left << std::setw(30) << src->name << std::right
|
|
<< std::setw(16) << "[ "
|
|
<< std::setw(0) << src->nb[0] << ", "
|
|
<< std::setw(5) << src->nb[1] << ", "
|
|
<< std::setw(5) << src->nb[2] << ", "
|
|
<< std::setw(5) << src->nb[3] << "] "
|
|
<< std::right << std::setw(15) << src_buf_name << std::right
|
|
<< "\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
file << "n_leafs = " << cgraph->n_leafs << "\n";
|
|
for (int i = 0; i < cgraph->n_leafs; i++) {
|
|
ggml_tensor * node = cgraph->leafs[i];
|
|
|
|
// Get buffer type name for leaf
|
|
const char * leaf_buf_name = "none";
|
|
ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
|
|
if (leaf_buf) {
|
|
leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
|
|
}
|
|
|
|
file << " - " << std::setw(3) << i << ": [ "
|
|
<< std::setw(5) << node->ne[0] << ", "
|
|
<< std::setw(5) << node->ne[1] << "] "
|
|
<< std::setw(8) << ggml_op_name(node->op) << " "
|
|
<< std::setw(16) << ggml_get_name(node)
|
|
<< std::setw(20) << leaf_buf_name << "\n";
|
|
}
|
|
// clang-format on
|
|
file << "========================================\n";
|
|
|
|
file.close();
|
|
}
|
|
|
|
void print_tensor_address_map(const ggml_cgraph * cgraph) {
|
|
std::map<void *, std::vector<std::string>> address_map;
|
|
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
|
auto * node = cgraph->nodes[node_n];
|
|
if (node->data) {
|
|
auto it = address_map.find(node->data);
|
|
if (it == address_map.end()) {
|
|
address_map[node->data] = std::vector<std::string>();
|
|
}
|
|
address_map[node->data].push_back(node->name);
|
|
}
|
|
}
|
|
for (const auto & pair : address_map) {
|
|
std::cout << "Address: " << pair.first << std::endl;
|
|
for (const auto & name : pair.second) {
|
|
std::cout << name << " ; ";
|
|
}
|
|
std::cout << std::endl << std::endl;
|
|
}
|
|
}
|
|
|
|
// Returns the tensor's element counts (ne) in reversed order: ne[GGML_MAX_DIMS - 1]
// comes first and ne[0] last.
ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
    std::vector<size_t> dims;
    dims.reserve(GGML_MAX_DIMS);
    for (int axis = GGML_MAX_DIMS; axis-- > 0;) {
        dims.push_back(static_cast<size_t>(tensor->ne[axis]));
    }
    return dims;
}
|
|
|
|
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
|
|
std::vector<size_t> stride;
|
|
for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
|
|
stride.push_back(static_cast<size_t>(tensor->nb[i]));
|
|
}
|
|
return stride;
|
|
}
|
|
|
|
// Maps the tensor's ggml element type to the matching OpenVINO element type.
// Floating-point (F64/F32/F16/BF16) and integer (I8/I16/I32/I64) types have a direct
// counterpart; every other ggml type yields ov::element::dynamic.
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
    ov::element::Type mapped = ov::element::dynamic;
    switch (tensor->type) {
        case GGML_TYPE_F64:
            mapped = ov::element::f64;
            break;
        case GGML_TYPE_F32:
            mapped = ov::element::f32;
            break;
        case GGML_TYPE_F16:
            mapped = ov::element::f16;
            break;
        case GGML_TYPE_BF16:
            mapped = ov::element::bf16;
            break;
        case GGML_TYPE_I8:
            mapped = ov::element::i8;
            break;
        case GGML_TYPE_I16:
            mapped = ov::element::i16;
            break;
        case GGML_TYPE_I32:
            mapped = ov::element::i32;
            break;
        case GGML_TYPE_I64:
            mapped = ov::element::i64;
            break;
        default:
            break;
    }
    return mapped;
}
|
|
|
|
// Shape (reversed ggml order, see get_shape) of the input called `name` of node
// `node_idx`. node_inputs.at() throws std::out_of_range for an unknown name.
ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
    const auto & node_info = m_node_info_list[node_idx];
    const auto * input_tensor = node_info.node_inputs.at(name);
    return ov::PartialShape(get_shape(input_tensor));
}
|
|
|
|
std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
|
|
return get_stride(m_node_info_list[node_idx].node_inputs.at(name));
|
|
}
|
|
|
|
size_t GgmlOvDecoder::get_view_input_size(int node_idx, const std::string & name) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
return it->second.size();
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
size_t GgmlOvDecoder::get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
if (view_index < it->second.size()) {
|
|
return it->second[view_index].second->view_offs;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
size_t GgmlOvDecoder::get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
if (view_index < it->second.size()) {
|
|
auto * view_tensor = it->second[view_index].second;
|
|
if (view_tensor && view_tensor->src[0]) {
|
|
return view_tensor->src[0]->view_offs;
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
std::vector<size_t> GgmlOvDecoder::get_view_input_stride(int node_idx,
|
|
const std::string & name,
|
|
size_t view_index) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
if (view_index < it->second.size()) {
|
|
return get_stride(it->second[view_index].second);
|
|
}
|
|
}
|
|
return {};
|
|
}
|
|
|
|
std::vector<size_t> GgmlOvDecoder::get_view_input_src_stride(int node_idx,
|
|
const std::string & name,
|
|
size_t view_index) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
if (view_index < it->second.size()) {
|
|
auto * view_tensor = it->second[view_index].second;
|
|
if (view_tensor && view_tensor->src[0]) {
|
|
return get_stride(view_tensor->src[0]);
|
|
}
|
|
}
|
|
}
|
|
return {};
|
|
}
|
|
|
|
// Shape (see get_shape) of the `view_index`-th view tensor recorded for input `name`
// of node `node_idx`; an empty shape when the input has no views or the index is out
// of range.
ov::Shape GgmlOvDecoder::get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }
    return get_shape(entry->second[view_index].second);
}
|
|
|
|
// Shape (see get_shape) of src[0] of the `view_index`-th view tensor recorded for
// input `name` of node `node_idx`. An empty shape is returned when the input has no
// views, the index is out of range, or the view tensor / its src[0] is null.
ov::Shape GgmlOvDecoder::get_view_input_src_ggml_shape(int                 node_idx,
                                                       const std::string & name,
                                                       size_t              view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }

    auto * view_tensor = entry->second[view_index].second;
    if (!view_tensor || !view_tensor->src[0]) {
        return {};
    }
    return get_shape(view_tensor->src[0]);
}
|
|
|
|
// OpenVINO shape of the `view_index`-th view tensor recorded for input `name` of node
// `node_idx`.
//
// Starts from get_shape(tensor). If m_node_dynamic_dims holds a dynamic dimension
// (value != -1) for the tensor, that dimension is replaced by get_static_n_tokens()
// when m_is_static is set and by -1 otherwise. Returns a default-constructed shape
// when the input has no views or the index is out of range.
ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int                 node_idx,
                                                        const std::string & name,
                                                        size_t              view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }

    auto * tensor = entry->second[view_index].second;
    ov::PartialShape shape = ov::PartialShape{get_shape(tensor)};

    const auto dynamic_entry = m_node_dynamic_dims.find(tensor);
    if (dynamic_entry == m_node_dynamic_dims.end() || dynamic_entry->second == -1) {
        return shape;
    }

    // The stored index is in ggml order; get_shape() reversed the dimensions, so the
    // position in the OpenVINO shape is 3 - index.
    const int ggml_dim = dynamic_entry->second;
    shape[3 - ggml_dim] = m_is_static ? get_static_n_tokens() : -1;
    return shape;
}
|
|
|
|
// OpenVINO shape of src[0] of the `view_index`-th view tensor recorded for input
// `name` of node `node_idx`.
//
// Starts from get_shape(src[0]). If m_node_dynamic_dims holds a dynamic dimension
// (value != -1) for that source, the dimension is replaced by get_static_n_tokens()
// when m_is_static is set and by -1 otherwise. Returns a default-constructed shape
// when the input has no views, the index is out of range, or the view tensor / its
// src[0] is null.
ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int                 node_idx,
                                                            const std::string & name,
                                                            size_t              view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }

    auto * view_tensor = entry->second[view_index].second;
    if (!view_tensor || !view_tensor->src[0]) {
        return {};
    }

    auto * base_tensor = view_tensor->src[0];
    ov::PartialShape shape = ov::PartialShape{get_shape(base_tensor)};

    const auto dynamic_entry = m_node_dynamic_dims.find(base_tensor);
    if (dynamic_entry == m_node_dynamic_dims.end() || dynamic_entry->second == -1) {
        return shape;
    }

    // The stored index is in ggml order; get_shape() reversed the dimensions, so the
    // position in the OpenVINO shape is 3 - index.
    const int ggml_dim = dynamic_entry->second;
    shape[3 - ggml_dim] = m_is_static ? get_static_n_tokens() : -1;
    return shape;
}
|
|
|
|
std::string GgmlOvDecoder::get_view_input_name(int node_idx, const std::string & name, size_t view_index) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
if (view_index < it->second.size()) {
|
|
return it->second[view_index].second->name;
|
|
}
|
|
}
|
|
return "";
|
|
}
|
|
|
|
std::string GgmlOvDecoder::get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const {
|
|
auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
|
|
if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
|
|
if (view_index < it->second.size()) {
|
|
auto * view_tensor = it->second[view_index].second;
|
|
if (view_tensor && view_tensor->src[0]) {
|
|
return view_tensor->src[0]->name;
|
|
}
|
|
}
|
|
}
|
|
return "";
|
|
}
|
|
|
|
// OpenVINO element type (see get_ov_type) of the input called `name` of node
// `node_idx`. node_inputs.at() throws std::out_of_range for an unknown name.
ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
    const auto & node_info = m_node_info_list[node_idx];
    const auto * input_tensor = node_info.node_inputs.at(name);
    return get_ov_type(input_tensor);
}
|
|
|
|
size_t GgmlOvDecoder::get_input_size() const {
|
|
return m_model_inputs.size();
|
|
}
|
|
|
|
size_t GgmlOvDecoder::get_input_size(int node_idx) const {
|
|
return m_node_info_list[node_idx].node_inputs_names.size();
|
|
}
|
|
|
|
std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
|
|
return m_node_info_list[node_idx].node_inputs_names;
|
|
}
|
|
|
|
// Shape (reversed ggml order, see get_shape) of the tensor stored as node_output for
// node `node_idx`.
ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
    auto * output_tensor = m_node_info_list[node_idx].node_output;
    const auto dims = get_shape(output_tensor);
    return ov::PartialShape(dims);
}
|
|
|
|
// OpenVINO element type (see get_ov_type) of the tensor stored as `node` for node
// `node_idx`.
ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
    const auto & node_info = m_node_info_list[node_idx];
    return get_ov_type(node_info.node);
}
|
|
|
|
std::vector<size_t> GgmlOvDecoder::get_output_stride(int node_idx) const {
|
|
auto * ggml_tensor = m_node_info_list[node_idx].node;
|
|
return get_stride(ggml_tensor);
|
|
}
|
|
|
|
std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
|
|
return {m_node_info_list[node_idx].node_output_name};
|
|
}
|
|
|
|
// Index-less overload: always returns the placeholder "UNKNOWN_OP_NAME". The string
// has static storage, so the returned reference stays valid.
const std::string & GgmlOvDecoder::get_op_name() const {
    static const std::string placeholder_name{"UNKNOWN_OP_NAME"};
    return placeholder_name;
}
|
|
|
|
int32_t GgmlOvDecoder::get_op_dynamic_dim(int node_idx) const {
|
|
auto it = m_node_dynamic_dims.find(m_node_info_list[node_idx].node);
|
|
if (it == m_node_dynamic_dims.end()) {
|
|
return -1;
|
|
}
|
|
return it->second;
|
|
}
|
|
|
|
const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
|
|
return m_node_info_list[node_idx].node_name;
|
|
}
|
|
|
|
int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
|
|
return m_node_info_list[node_idx].node_inputs.at(name)->op_params;
|
|
}
|
|
|
|
int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
|
|
return m_node_info_list[node_idx].node->op_params;
|
|
}
|
|
|
|
size_t GgmlOvDecoder::get_output_op_offset(int node_idx) const {
|
|
return m_node_info_list[node_idx].node->view_offs;
|
|
}
|
|
|
|
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
|
|
for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
|
|
if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
|
|
continue;
|
|
}
|
|
node_visitor(std::make_shared<GgmlOvDecoder>(*this), node_idx);
|
|
}
|
|
}
|
|
|
|
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
|
|
static const std::map<ggml_op, std::string> ops = {
|
|
{GGML_OP_NONE, "GGML_OP_NONE" },
|
|
{GGML_OP_ACC, "GGML_OP_ACC" },
|
|
{GGML_OP_ADD, "GGML_OP_ADD" },
|
|
{GGML_OP_ADD1, "GGML_OP_ADD1" },
|
|
{GGML_OP_ADD_ID, "GGML_OP_ADD_ID" },
|
|
{GGML_OP_CONCAT, "GGML_OP_CONCAT" },
|
|
{GGML_OP_CONT, "GGML_OP_CONT" },
|
|
{GGML_OP_DIV, "GGML_OP_DIV" },
|
|
{GGML_OP_DUP, "GGML_OP_DUP" },
|
|
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
|
|
{GGML_OP_MUL, "GGML_OP_MUL" },
|
|
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
|
|
{GGML_OP_MUL_MAT_ID, "GGML_OP_MUL_MAT_ID" },
|
|
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
|
|
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
|
|
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
|
|
{GGML_OP_NORM, "GGML_OP_NORM" },
|
|
{GGML_OP_ROPE, "GGML_OP_ROPE" },
|
|
{GGML_OP_SCALE, "GGML_OP_SCALE" },
|
|
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
|
|
{GGML_OP_SUM_ROWS, "GGML_OP_SUM_ROWS" },
|
|
{GGML_OP_SUB, "GGML_OP_SUB" },
|
|
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
|
|
{GGML_OP_VIEW, "GGML_OP_VIEW" },
|
|
{GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
|
|
{GGML_OP_CPY, "GGML_OP_CPY" },
|
|
{GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT" },
|
|
{GGML_OP_L2_NORM, "GGML_OP_L2_NORM" },
|
|
{GGML_OP_CLAMP, "GGML_OP_CLAMP" },
|
|
{GGML_OP_PAD, "GGML_OP_PAD" },
|
|
{GGML_OP_SSM_CONV, "GGML_OP_SSM_CONV" },
|
|
{GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
|
|
{GGML_OP_ARGSORT, "GGML_OP_ARGSORT" },
|
|
{GGML_OP_REPEAT, "GGML_OP_REPEAT" },
|
|
{GGML_OP_IM2COL, "GGML_OP_IM2COL" }
|
|
};
|
|
static const std::map<ggml_unary_op, std::string> unary_ops = {
|
|
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
|
|
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
|
|
{GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
|
|
{GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
|
|
{GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
|
|
{GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
|
|
{GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
|
|
{GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
|
|
{GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
|
|
{GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
|
|
{GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
|
|
{GGML_UNARY_OP_SOFTPLUS, "GGML_UNARY_OP_SOFTPLUS" },
|
|
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
|
|
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
|
|
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
|
|
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
|
|
};
|
|
static const std::map<ggml_glu_op, std::string> glu_ops = {
|
|
{GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
|
|
{GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
|
|
{GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
|
|
};
|
|
|
|
switch (node->op) {
|
|
case GGML_OP_UNARY:
|
|
return unary_ops.at(ggml_get_unary_op(node));
|
|
case GGML_OP_GLU:
|
|
return glu_ops.at(ggml_get_glu_op(node));
|
|
default:
|
|
return ops.at(node->op);
|
|
}
|
|
static const std::string unknown_op = "UNKNOWN_GGML_OP";
|
|
return unknown_op;
|
|
}
|
|
|
|
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
|
|
return m_node_info_list[node_idx].node_op_type;
|
|
}
|
|
|
|
// Index-less overload: always returns the placeholder "UNKNOWN_GGML_OP". The string
// has static storage, so the returned reference stays valid.
const std::string & GgmlOvDecoder::get_op_type() const {
    static const std::string placeholder_op{"UNKNOWN_GGML_OP"};
    return placeholder_op;
}
|
|
|
|
void GgmlOvDecoder::compute_node_dynamic_dims() {
|
|
auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
|
|
if (!node) {
|
|
return;
|
|
}
|
|
|
|
if (node->op == GGML_OP_CPY) {
|
|
m_node_dynamic_dims[node] = -1;
|
|
}
|
|
|
|
if (m_node_dynamic_dims.count(node)) {
|
|
return;
|
|
}
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
ggml_tensor * src = node->src[i];
|
|
if (src == nullptr) {
|
|
continue;
|
|
}
|
|
struct ggml_tensor * root_src = nullptr;
|
|
// if (src->org_src) {
|
|
// root_src = src->org_src;
|
|
// }
|
|
if (root_src) {
|
|
if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) || is_output_idx(root_src, node)) {
|
|
m_node_dynamic_dims[root_src] = 0;
|
|
m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
|
|
continue;
|
|
}
|
|
self(self, root_src);
|
|
m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
|
|
} else {
|
|
if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
|
|
m_node_dynamic_dims[src] = 0;
|
|
continue;
|
|
}
|
|
if (node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful() && !m_model_is_splitted) {
|
|
m_node_dynamic_dims[src] = 1;
|
|
continue;
|
|
}
|
|
self(self, src);
|
|
}
|
|
}
|
|
switch (node->op) {
|
|
case GGML_OP_NONE:
|
|
m_node_dynamic_dims[node] = -1;
|
|
break;
|
|
case GGML_OP_GET_ROWS:
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[1]] != -1) {
|
|
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]];
|
|
if (dynamic_dim_idx == 0) {
|
|
m_node_dynamic_dims[node] = 1;
|
|
} else {
|
|
auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) *
|
|
ggml_type_size(node->src[0]->type);
|
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
if (dynamic_dim_stride == node->src[0]->nb[i]) {
|
|
m_node_dynamic_dims[node] = i;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
// OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
|
|
// "Dynamic dim value mismatch for node: " + std::string(node->name) +
|
|
// " and its src[1]: " + std::string(node->src[1]->name));
|
|
}
|
|
break;
|
|
case GGML_OP_MUL:
|
|
case GGML_OP_MUL_MAT:
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
|
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
|
|
}
|
|
if (m_node_dynamic_dims[node->src[1]] != -1) {
|
|
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
|
|
}
|
|
break;
|
|
case GGML_OP_PERMUTE:
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
|
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
|
// auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
|
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
if (node->op_params[i] == dynamic_dim_idx) {
|
|
m_node_dynamic_dims[node] = i;
|
|
break;
|
|
}
|
|
}
|
|
// OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
|
|
// "Dynamic dim value mismatch for node: " + std::string(node->name) +
|
|
// " and its src[0]: " + std::string(node->src[0]->name));
|
|
}
|
|
break;
|
|
case GGML_OP_VIEW: {
|
|
// Use stride-based matching: the stride of a VIEW dimension directly
|
|
// encodes which source dimension it indexes into, so it uniquely
|
|
// identifies the dynamic dim even when two dims share the same size.
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
|
if (node->src[0]->op == GGML_OP_NONE) {
|
|
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
|
|
break;
|
|
}
|
|
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
|
auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
|
|
auto dynamic_dim_stride =
|
|
node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) * ggml_type_size(node->type);
|
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
if (node->nb[i] == dynamic_dim_stride) {
|
|
m_node_dynamic_dims[node] = i;
|
|
break;
|
|
}
|
|
}
|
|
if (m_node_dynamic_dims[node] != -1 && dynamic_dim_value != node->ne[m_node_dynamic_dims[node]]) {
|
|
m_node_dynamic_dims[node] = -1;
|
|
// std::cout << "Warning: Dynamic dim value mismatch for node: " << node->name
|
|
// << " and its src[0]: " << node->src[0]->name << std::endl;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_TRANSPOSE:
|
|
case GGML_OP_RESHAPE: {
|
|
// RESHAPE requires src[0] to be contiguous, so both src and result
|
|
// have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
|
|
// Match src->nb[dynamic_dim] against result->nb[i] to find the output
|
|
// dimension whose flat-memory boundary aligns with the source dynamic
|
|
// boundary. This is unambiguous (result strides are strictly monotone)
|
|
// and handles merged-lower-dim cases that ne-value matching misses.
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
|
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
|
auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx];
|
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
|
|
m_node_dynamic_dims[node] = i;
|
|
break;
|
|
}
|
|
}
|
|
if (m_node_dynamic_dims[node] == -1) {
|
|
// std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_FLASH_ATTN_EXT: {
|
|
// Output shape is hard-coded in ggml_flash_attn_ext as:
|
|
// ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] }
|
|
// i.e. output dim 0 <- v dim 0 (head_size, static)
|
|
// output dim 1 <- q dim 2 (n_heads, static)
|
|
// output dim 2 <- q dim 1 (n_tokens, potentially dynamic)
|
|
// output dim 3 <- q dim 3 (batch, static)
|
|
// Using the fixed q-dim -> output-dim mapping table.
|
|
// q is src[0]; the mapping from q's dynamic dim to the output dim is:
|
|
// q dim 1 -> output dim 2
|
|
// q dim 2 -> output dim 1
|
|
// q dim 3 -> output dim 3
|
|
// q dim 0 -> output dim 0 (head_size axis, unlikely to be dynamic)
|
|
constexpr int q_to_out[GGML_MAX_DIMS] = {0, 2, 1, 3};
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
|
auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]];
|
|
m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim];
|
|
}
|
|
break;
|
|
}
|
|
case GGML_OP_CONT:
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
|
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
|
if (ggml_are_same_shape(node, node->src[0])) {
|
|
m_node_dynamic_dims[node] = dynamic_dim_idx;
|
|
} else {
|
|
size_t src_logical_nb[GGML_MAX_DIMS];
|
|
src_logical_nb[0] = ggml_type_size(node->src[0]->type);
|
|
src_logical_nb[1] = src_logical_nb[0] * (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type));
|
|
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
|
src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1];
|
|
}
|
|
|
|
auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) *
|
|
ggml_type_size(node->type);
|
|
int matched_dim_count = 0;
|
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
|
|
m_node_dynamic_dims[node] = i;
|
|
matched_dim_count++;
|
|
}
|
|
}
|
|
if (matched_dim_count != 1) {
|
|
m_node_dynamic_dims[node] = -1;
|
|
// std::cout << "Warning: Cannot determine dynamic dim for CONT node: " << node->name
|
|
// << " and its src[0]: " << node->src[0]->name << std::endl;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case GGML_OP_RMS_NORM:
|
|
case GGML_OP_NORM:
|
|
case GGML_OP_ADD:
|
|
case GGML_OP_GLU:
|
|
case GGML_OP_ROPE:
|
|
case GGML_OP_SCALE:
|
|
case GGML_OP_SOFT_MAX:
|
|
case GGML_OP_ARGSORT:
|
|
case GGML_OP_ADD_ID:
|
|
case GGML_OP_UNARY:
|
|
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
|
|
break;
|
|
case GGML_OP_MUL_MAT_ID:
|
|
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
|
|
break;
|
|
case GGML_OP_CPY:
|
|
case GGML_OP_SET_ROWS:
|
|
m_node_dynamic_dims[node] = -1;
|
|
break;
|
|
case GGML_OP_IM2COL: {
|
|
m_node_dynamic_dims[node] = -1;
|
|
if (m_node_dynamic_dims[node->src[1]] != -1) {
|
|
const bool is_2D = node->op_params[6] == 1;
|
|
const int src_dyn = m_node_dynamic_dims[node->src[1]];
|
|
if (is_2D) {
|
|
if (src_dyn == 0) {
|
|
m_node_dynamic_dims[node] = 1; // IW -> OW
|
|
} else if (src_dyn == 1) {
|
|
m_node_dynamic_dims[node] = 2; // IH -> OH
|
|
} else if (src_dyn == 3) {
|
|
m_node_dynamic_dims[node] = 3; // N -> N
|
|
}
|
|
} else {
|
|
if (src_dyn == 0) {
|
|
m_node_dynamic_dims[node] = 1; // IW -> OW
|
|
} else if (src_dyn == 2) {
|
|
m_node_dynamic_dims[node] = 2; // N -> N (1D: b->ne[2] is the batch/channel dim)
|
|
}
|
|
}
|
|
if (m_node_dynamic_dims[node] != -1) {
|
|
OPENVINO_ASSERT(node->src[1]->ne[src_dyn] == node->ne[m_node_dynamic_dims[node]],
|
|
"Dynamic dim value mismatch for IM2COL node: " + std::string(node->name) +
|
|
" and its src[1]: " + std::string(node->src[1]->name));
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
// std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
|
|
break;
|
|
}
|
|
};
|
|
|
|
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = m_cgraph->nodes[i];
|
|
visit_node(visit_node, node);
|
|
}
|
|
|
|
// Debug dump: print each node's (and its srcs') name and shape, showing the dynamic dim as "*" (m_node_dynamic_dims holds the index of the dynamic dim; -1 matches no dimension)
|
|
if (0) {
|
|
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = m_cgraph->nodes[i];
|
|
int dynamic_dim = m_node_dynamic_dims[node];
|
|
std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op)
|
|
<< " shape: [";
|
|
for (int j = 0; j < 4; j++) {
|
|
if (j == dynamic_dim) {
|
|
std::cout << "*";
|
|
} else {
|
|
std::cout << node->ne[j];
|
|
}
|
|
if (j < 3) {
|
|
std::cout << ", ";
|
|
}
|
|
}
|
|
std::cout << "]" << std::endl;
|
|
// print the src name & shape with the dynamic dim for debugging
|
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
ggml_tensor * src = node->src[j];
|
|
if (src == nullptr) {
|
|
continue;
|
|
}
|
|
int src_dynamic_dim = m_node_dynamic_dims[src];
|
|
std::cout << " [" << j << "] src_name: " << src->name << " [";
|
|
for (int k = 0; k < 4; k++) {
|
|
if (k == src_dynamic_dim) {
|
|
std::cout << "*";
|
|
} else {
|
|
std::cout << src->ne[k];
|
|
}
|
|
if (k < 3) {
|
|
std::cout << ", ";
|
|
}
|
|
}
|
|
std::cout << "]" << std::endl;
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
}
|
|
}
|