From 037397792a082d8e7f5cfc5a3457201bdcda7c01 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 22 Jun 2026 15:50:01 +0200 Subject: [PATCH] vulkan: split ggml-vulkan.cpp file --- ggml/src/ggml-vulkan/CMakeLists.txt | 14 +- ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp | 1867 ++ ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp | 783 + ggml/src/ggml-vulkan/ggml-vulkan-common.h | 228 + ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp | 1608 ++ .../ggml-vulkan/ggml-vulkan-flash-attn.cpp | 293 + ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp | 1610 ++ ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp | 1618 ++ ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp | 2207 ++ .../src/ggml-vulkan/ggml-vulkan-operators.cpp | 2893 +++ .../src/ggml-vulkan/ggml-vulkan-pipelines.cpp | 512 + .../ggml-vulkan/ggml-vulkan-push-constants.h | 872 + ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp | 2704 +++ ggml/src/ggml-vulkan/ggml-vulkan-types.h | 1341 ++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18393 ---------------- 15 files changed, 18549 insertions(+), 18394 deletions(-) create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-common.h create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-flash-attn.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-operators.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-pipelines.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-push-constants.h create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp create mode 100644 ggml/src/ggml-vulkan/ggml-vulkan-types.h delete mode 100644 ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 2d9e85794a..45eac2c387 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -60,7 +60,19 @@ if (Vulkan_FOUND) message(STATUS "Vulkan found") ggml_add_backend_library(ggml-vulkan - ggml-vulkan.cpp + ggml-vulkan-instance.cpp + ggml-vulkan-shaders.cpp + ggml-vulkan-buffers.cpp + ggml-vulkan-pipelines.cpp + ggml-vulkan-matmul.cpp + ggml-vulkan-flash-attn.cpp + ggml-vulkan-operators.cpp + ggml-vulkan-graph.cpp + ggml-vulkan-backend.cpp + ggml-vulkan-debug.cpp + ggml-vulkan-types.h + ggml-vulkan-push-constants.h + ggml-vulkan-common.h ../../include/ggml-vulkan.h ) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp new file mode 100644 index 0000000000..5e3169ec39 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp @@ -0,0 +1,1867 @@ +#include "ggml-vulkan-common.h" + +bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { + return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; +} + +void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { + VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_vk_destroy_buffer(ctx->dev_buffer); + delete ctx; +} + +void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { + return vk_ptr_base; + + UNUSED(buffer); +} + +enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; +} + +void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + VK_LOG_DEBUG("ggml_backend_vk_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; + vk_buffer buf = buf_ctx->dev_buffer; + + if (size == 0) { + return; + } + + uint32_t val32 = (uint32_t)value * 0x01010101; + ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); +} + +void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; + vk_buffer buf = buf_ctx->dev_buffer; + + if (size == 0) { + return; + } + + ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +} + +void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, + size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { + VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " << + n_copies << ", " << stride_tensor << ", " << stride_data << ")"); + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; + vk_buffer buf = buf_ctx->dev_buffer; + + if (size == 0) { + return; + } + + ggml_vk_buffer_write_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_data, stride_tensor, size, n_copies); +} + +void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; + + if (size == 0) { + return; + } + + vk_buffer buf = buf_ctx->dev_buffer; + + ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +} + +void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, + size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { + VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " << + n_copies << ", " << stride_tensor << ", " << stride_data << ")"); + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; + + if (size == 0) { + return; + } + + vk_buffer buf = buf_ctx->dev_buffer; + + ggml_vk_buffer_read_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_tensor, stride_data, size, n_copies); +} + +bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + if (ggml_nbytes(src) == 0) { + return true; + } + + if (ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + + vk_buffer src_buf = src_buf_ctx->dev_buffer; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; + + ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); + + return true; + } + return false; + + UNUSED(buffer); +} + +void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + + ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); +} + +const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + +ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + + vk_buffer dev_buffer = nullptr; + try { + dev_buffer = ggml_vk_create_buffer_device(ctx->device, size); + } catch (const vk::SystemError& e) { + return nullptr; + } + + ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->device, std::move(dev_buffer), ctx->name); + + return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); +} + +size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + return ctx->device->properties.limits.minStorageBufferOffsetAlignment; +} + +size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + return ctx->device->suballocation_block_size; +} + +size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + return ggml_nbytes(tensor); + + UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { + ggml_vk_instance_init(); + + VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); + + vk_device dev = ggml_vk_get_device(dev_num); + + return &dev->buffer_type; +} + +static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_VK_NAME "_Host"; + + UNUSED(buft); +} + +static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); + ggml_vk_host_free(vk_instance.devices[0], buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); + + size += 32; // Behave like the CPU buffer type + void * ptr = nullptr; + try { + ptr = ggml_vk_host_malloc(vk_instance.devices[0], size); + } catch (vk::SystemError& e) { + GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what()); + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; + + return buffer; + + UNUSED(buft); +} + +static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment; + + UNUSED(buft); +} + +static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + return vk_instance.devices[0]->suballocation_block_size; + + UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_vk_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + ggml_vk_instance_init(); + ggml_vk_get_device(0); + + return &ggml_backend_vk_buffer_type_host; +} + +static const char * ggml_backend_vk_name(ggml_backend_t backend) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return ctx->name.c_str(); +} + +void ggml_backend_vk_free(ggml_backend_t backend) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); + + ggml_vk_cleanup(ctx); + + delete ctx; + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return &ctx->device->buffer_type; +} + +static void ggml_backend_vk_set_tensor_2d_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, + size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { + VK_LOG_DEBUG("ggml_backend_vk_set_tensor_2d_async(" << size << ", " << n_copies << ")"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); + + if (size == 0) { + return; + } + + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; + + vk_context cpy_ctx; + + if (ctx->device->async_use_transfer_queue) { + if (ctx->transfer_ctx.expired()) { + cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); + ctx->transfer_ctx = cpy_ctx; + ggml_vk_ctx_begin(ctx->device, cpy_ctx); + } else { + cpy_ctx = ctx->transfer_ctx.lock(); + } + } else { + cpy_ctx = ggml_vk_get_compute_ctx(ctx); + } + + vk_buffer buf = buf_ctx->dev_buffer; + + auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; + + bool ret = ggml_vk_buffer_write_2d_async(cpy_ctx, buf, dst_offset, data, stride_data, stride_tensor, size, n_copies); + + if (!ret) { + const size_t staging_size = size * n_copies; + ggml_vk_ensure_sync_staging_buffer(ctx, staging_size); + ggml_vk_sync_buffers(nullptr, cpy_ctx); + + std::vector slices(1); + if (size == stride_tensor) { + slices[0].srcOffset = 0; + slices[0].dstOffset = dst_offset; + slices[0].size = staging_size; + } else { + slices.resize(n_copies); + for (size_t i = 0; i < n_copies; i++) { + slices[i].srcOffset = i * size; + slices[i].dstOffset = dst_offset + i * stride_tensor; + slices[i].size = size; + } + } + + cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, slices); + + if (size == stride_data) { + deferred_memcpy(ctx->sync_staging->ptr, data, staging_size, &cpy_ctx->in_memcpys); + } else { + for (size_t i = 0; i < n_copies; i++) { + deferred_memcpy((uint8_t *)ctx->sync_staging->ptr + i * size, (const uint8_t *)data + i * stride_data, size, &cpy_ctx->in_memcpys); + } + } + ggml_vk_synchronize(ctx); + } +} + +static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); + ggml_backend_vk_set_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size); +} + +static void ggml_backend_vk_get_tensor_2d_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, + size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { + VK_LOG_DEBUG("ggml_backend_vk_get_tensor_2d_async(" << size << ", " << n_copies << ")"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); + + if (size == 0) { + return; + } + + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; + + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); + + vk_buffer buf = buf_ctx->dev_buffer; + + auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; + bool ret = ggml_vk_buffer_read_2d_async(compute_ctx, buf, src_offset, data, stride_tensor, stride_data, size, n_copies); + + if (!ret) { + const size_t staging_size = size * n_copies; + ggml_vk_ensure_sync_staging_buffer(ctx, staging_size); + ggml_vk_sync_buffers(nullptr, compute_ctx); + + std::vector slices(1); + if (size == stride_tensor) { + slices[0].srcOffset = src_offset; + slices[0].dstOffset = 0; + slices[0].size = staging_size; + } else { + slices.resize(n_copies); + for (size_t i = 0; i < n_copies; i++) { + slices[i].srcOffset = src_offset + i * stride_tensor; + slices[i].dstOffset = i * size; + slices[i].size = size; + } + } + + compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, slices); + + if (size == stride_data) { + deferred_memcpy(data, ctx->sync_staging->ptr, staging_size, &compute_ctx->out_memcpys); + } else { + for (size_t i = 0; i < n_copies; i++) { + deferred_memcpy((uint8_t *)data + i * stride_data, (const uint8_t *)ctx->sync_staging->ptr + i * size, size, &compute_ctx->out_memcpys); + } + } + ggml_vk_synchronize(ctx); + } +} + +static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); + ggml_backend_vk_get_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size); +} + +static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; + + // Skip zero-size tensors + if (ggml_nbytes(src) == 0) { + return true; + } + + if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) { + return false; + } + + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; + + if (ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + + // Async copy only works within the same device + if (src_buf_ctx->dev_buffer->device != dst_buf->device) { + return false; + } + + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); + + ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, + src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs, + ggml_nbytes(src)); + return true; + } + + if (ggml_backend_buffer_is_host(src->buffer)) { + vk_buffer pinned_buf = nullptr; + size_t pinned_offset = 0; + ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset); + if (pinned_buf == nullptr) { + return false; + } + + vk_context cpy_ctx; + if (ctx->device->async_use_transfer_queue) { + if (ctx->transfer_ctx.expired()) { + cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); + ctx->transfer_ctx = cpy_ctx; + ggml_vk_ctx_begin(ctx->device, cpy_ctx); + } else { + cpy_ctx = ctx->transfer_ctx.lock(); + } + } else { + cpy_ctx = ggml_vk_get_compute_ctx(ctx); + } + + return ggml_vk_buffer_write_async(cpy_ctx, dst_buf, + vk_tensor_offset(dst) + dst->view_offs, + src->data, ggml_nbytes(src)); + } + + GGML_UNUSED(backend_src); + return false; +} + +static void ggml_backend_vk_synchronize(ggml_backend_t backend) { + VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + ggml_vk_synchronize(ctx); + + ggml_vk_graph_cleanup(ctx); +} + +static int32_t find_first_set(uint32_t x) { + int32_t ret = 0; + if (!x) { + return -1; + } + while (!(x & 1)) { + x >>= 1; + ret++; + } + return ret; +} + +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + if (vk_instance.debug_utils_support) { + vk::DebugUtilsLabelEXT dul = {}; + dul.pLabelName = "ggml_backend_vk_graph_compute"; + dul.color = std::array{1.0f, 1.0f, 1.0f, 1.0f}; + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast(&dul)); + } + + ctx->prealloc_size_add_rms_partials_offset = 0; + ctx->do_add_rms_partials = false; + ctx->do_add_rms_partials_offset_calculation = false; + + int last_node = cgraph->n_nodes - 1; + + // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly + while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) { + last_node -= 1; + } + + // Reserve tensor context space for all nodes + ctx->tensor_ctxs.resize(cgraph->n_nodes); + + bool first_node_in_batch = true; // true if next node will be first node in a batch + int submit_node_idx = 0; // index to first node in a batch + + ggml_vk_submit_transfer_ctx(ctx); + + vk_context compute_ctx; + if (vk_perf_logger_enabled) { + // allocate/resize the query pool + if (ctx->num_queries < cgraph->n_nodes + 1) { + if (ctx->query_pool) { + ctx->device->device.destroyQueryPool(ctx->query_pool); + } + vk::QueryPoolCreateInfo query_create_info; + query_create_info.queryType = vk::QueryType::eTimestamp; + query_create_info.queryCount = cgraph->n_nodes + 100; + ctx->query_pool = ctx->device->device.createQueryPool(query_create_info); + ctx->num_queries = query_create_info.queryCount; + ctx->query_fusion_names.resize(ctx->num_queries); + ctx->query_fusion_node_count.resize(ctx->num_queries); + ctx->query_nodes.resize(ctx->num_queries); + ctx->query_node_idx.resize(ctx->num_queries); + } + + ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1); + std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr); + std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0); + std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr); + std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0); + + GGML_ASSERT(ctx->compute_ctx.expired()); + compute_ctx = ggml_vk_get_compute_ctx(ctx); + ctx->query_idx = 0; + compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + ggml_vk_sync_buffers(ctx, compute_ctx); + } + + ctx->prealloc_y_last_pipeline_used = nullptr; + ctx->prealloc_y_last_tensor_used = nullptr; + ctx->prealloc_y_last_decode_vector_staging = false; + + if (ctx->prealloc_size_add_rms_partials) { + ggml_vk_preallocate_buffers(ctx, nullptr); + compute_ctx = ggml_vk_get_compute_ctx(ctx); + // initialize partial sums to zero. + ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials); + ggml_vk_sync_buffers(ctx, compute_ctx); + } + + // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. + // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB + // (and scaled down based on model size, so smaller models submit earlier). + // Also submit at least every 100 nodes, in case there are workloads without as much matmul. + int nodes_per_submit = 100; + int submitted_nodes = 0; + int submit_count = 0; + uint64_t mul_mat_bytes = 0; + uint64_t total_mul_mat_bytes = 0; + uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u); + for (int i = 0; i < cgraph->n_nodes; i++) { + if (first_node_in_batch) { + submit_node_idx = i; + } + + if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { + auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]); + mul_mat_bytes += bytes; + total_mul_mat_bytes += bytes; + } + + // op_srcs_fused_elementwise indicates whether an op's srcs all contribute to + // the fused result in an elementwise-way. This affects whether the memory for + // the src is allowed to overlap the memory for the destination. + // The array is sized to handle the largest fusion (asserted later). + bool op_srcs_fused_elementwise[12]; + + ctx->fused_topk_moe_mode = TOPK_MOE_COUNT; + ctx->fused_topk_moe_scale = false; + const char *fusion_string {}; + if (!ctx->device->disable_fusion) { + uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); + if (num_adds) { + ctx->num_additional_fused_ops = num_adds - 1; + fusion_string = "MULTI_ADD"; + std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, true); + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) { + ctx->num_additional_fused_ops = 2; + fusion_string = "MUL_MAT_ADD_ADD"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + op_srcs_fused_elementwise[2] = true; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { + ctx->num_additional_fused_ops = 1; + fusion_string = "MUL_MAT_ADD"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 2; + fusion_string = "MUL_MAT_ID_ADD_ID_MUL"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + op_srcs_fused_elementwise[2] = true; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { + ctx->num_additional_fused_ops = 1; + fusion_string = "MUL_MAT_ID_ADD_ID"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + fusion_string = "MUL_MAT_ID_MUL"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) && + ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) && + ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) && + ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) { + ctx->num_additional_fused_ops = 4; + fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = false; + op_srcs_fused_elementwise[2] = false; + op_srcs_fused_elementwise[3] = false; + op_srcs_fused_elementwise[4] = false; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&& + ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) { + ctx->num_additional_fused_ops = 2; + fusion_string = "RMS_NORM_MUL_ROPE"; + // rope is approximately elementwise - whole rows are done by a single workgroup and it's row-wise + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + op_srcs_fused_elementwise[2] = true; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + fusion_string = "RMS_NORM_MUL"; + // rms_norm is not elementwise, but whole rows must be consumed and the scale factor computed before + // they are overwritten, and one workgroup per row. So close enough. + op_srcs_fused_elementwise[0] = true; + op_srcs_fused_elementwise[1] = true; + } else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 2)) { + ctx->num_additional_fused_ops = 2; + fusion_string = "SSM_CONV_BIAS_SILU"; + // ssm_conv reads multiple input tokens per output, so it's not elementwise w.r.t. its srcs. + // The downstream add and silu are elementwise on the conv output. + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + op_srcs_fused_elementwise[2] = true; + } else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 1)) { + ctx->num_additional_fused_ops = 1; + fusion_string = "SSM_CONV_SILU"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && + ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && + ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) { + ctx->num_additional_fused_ops = 2; + fusion_string = "ROPE_VIEW_SET_ROWS"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = false; + op_srcs_fused_elementwise[2] = false; + } else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) { + ctx->num_additional_fused_ops = 4; + fusion_string = "SNAKE"; + // elementwise=true: snake.comp is safe under exact aliasing because each + // thread reads data_x[idx] into a register before writing data_d[idx] + // with a data dependency on that register. The overlap check still + // rejects partial overlaps (different base or size). + std::fill_n(op_srcs_fused_elementwise, 5, true); + } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) && + ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) && + ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) { + ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1; + // view of argsort writes to memory + ctx->fused_ops_write_mask |= 1 << 3; + ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM; + fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM"; + std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); + } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) && + ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) && + ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) { + ctx->num_additional_fused_ops = topk_moe_sigmoid_norm_bias.size() - 1; + // view of argsort writes to memory + ctx->fused_ops_write_mask |= 1 << 4; + ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS; + fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS"; + std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); + } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) && + ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) && + ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) { + ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1; + // view of argsort writes to memory + ctx->fused_ops_write_mask |= 1 << 3; + ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX; + fusion_string = "TOPK_MOE_EARLY_SOFTMAX"; + std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); + } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) && + ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) && + ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) { + ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1; + // view of argsort writes to memory + ctx->fused_ops_write_mask |= 1 << 1; + ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX; + fusion_string = "TOPK_MOE_LATE_SOFTMAX"; + std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); + } + if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { + // Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano. + if (ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops - 1, { GGML_OP_DIV, GGML_OP_RESHAPE, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 }) || + ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) { + ctx->fused_topk_moe_scale = true; + ctx->num_additional_fused_ops++; + op_srcs_fused_elementwise[ctx->num_additional_fused_ops] = false; + } + } + } + GGML_ASSERT(ctx->num_additional_fused_ops < (int)(sizeof(op_srcs_fused_elementwise) / sizeof(op_srcs_fused_elementwise[0]))); + ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops; + + // Check whether fusion would overwrite src operands while they're still in use. + // If so, disable fusion. + if (ctx->num_additional_fused_ops) { + // There are up to two output nodes - topk_moe has two. + uint32_t bits = ctx->fused_ops_write_mask & ~(1 << ctx->num_additional_fused_ops); + ggml_tensor *output_nodes[2] {}; + output_nodes[0] = cgraph->nodes[i + ctx->num_additional_fused_ops]; + if (bits) { + int output_idx = find_first_set(bits); + GGML_ASSERT(bits == (1u << output_idx)); + output_nodes[1] = cgraph->nodes[i + output_idx]; + } + + bool need_disable = false; + + // topk_moe often overwrites the source, but for a given row all the src values are + // loaded before anything is stored. If there's only one row, this is safe, so treat + // this as a special case. + bool is_topk_moe_single_row = ctx->fused_topk_moe_mode != TOPK_MOE_COUNT && + ggml_nrows(cgraph->nodes[i]->src[0]) == 1; + + if (!is_topk_moe_single_row) { + for (int j = 0; j < 2; ++j) { + ggml_tensor *dst = output_nodes[j]; + if (!dst) { + continue; + } + // Loop over all srcs of all nodes in the fusion. If the src overlaps + // the destination and the src is not an intermediate node that's being + // elided, then disable fusion. + for (int k = 0; k <= ctx->num_additional_fused_ops; ++k) { + for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { + ggml_tensor *src = cgraph->nodes[i + k]->src[s]; + if (!src || src->op == GGML_OP_NONE) { + continue; + } + if (ggml_vk_tensors_overlap(src, dst, op_srcs_fused_elementwise[k])) { + bool found = false; + for (int n = 0; n < k; ++n) { + if (cgraph->nodes[i + n] == src) { + found = true; + break; + } + } + if (!found) { + need_disable = true; + } + } + } + } + } + } + if (need_disable) { + ctx->num_additional_fused_ops = 0; + ctx->fused_ops_write_mask = 1; + ctx->fused_topk_moe_mode = TOPK_MOE_COUNT; + ctx->fused_topk_moe_scale = false; + } + } + + // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) + bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; + bool submit = (submitted_nodes >= nodes_per_submit) || + (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) || + (i + ctx->num_additional_fused_ops >= last_node) || + (almost_ready && !ctx->almost_ready_fence_pending); + + bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); + + if (vk_perf_logger_enabled && enqueued) { + compute_ctx = ggml_vk_get_compute_ctx(ctx); + if (!vk_perf_logger_concurrent) { + // track a single node/fusion for the current query + ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; + ctx->query_fusion_names[ctx->query_idx] = fusion_string; + compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + ggml_vk_sync_buffers(ctx, compute_ctx); + } else { + // track a fusion string and number of fused ops for the current node_idx + ctx->query_fusion_names[i] = fusion_string; + ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops; + } + } + + if (enqueued) { + ++submitted_nodes; + +#ifndef GGML_VULKAN_CHECK_RESULTS + if (first_node_in_batch) { + first_node_in_batch = false; + } +#endif + } + + if (submit && enqueued) { + first_node_in_batch = true; + submitted_nodes = 0; + mul_mat_bytes = 0; + if (submit_count < 3) { + mul_mat_bytes_per_submit *= 2; + } + submit_count++; + } + i += ctx->num_additional_fused_ops; + ctx->num_additional_fused_ops = 0; + ctx->fused_ops_write_mask = 0; + } + + ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; + + if (vk_perf_logger_enabled) { + // End the command buffer and submit/wait + GGML_ASSERT(!ctx->compute_ctx.expired()); + compute_ctx = ctx->compute_ctx.lock(); + ggml_vk_ctx_end(compute_ctx); + + ggml_vk_submit(compute_ctx, ctx->device->fence); + VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences"); + ctx->device->device.resetFences({ ctx->device->fence }); + ctx->compute_ctx.reset(); + + // Get the results and pass them to the logger + std::vector timestamps(cgraph->n_nodes + 1); + VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); + if (!vk_perf_logger_concurrent) { + // Log each op separately + for (int i = 1; i < ctx->query_idx; i++) { + auto node = ctx->query_nodes[i]; + auto name = ctx->query_fusion_names[i]; + ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + } + } else { + // Log each group of nodes + int prev_node_idx = 0; + for (int i = 1; i < ctx->query_idx; i++) { + auto cur_node_idx = ctx->query_node_idx[i]; + std::vector nodes; + std::vector names; + for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) { + if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) { + continue; + } + nodes.push_back(cgraph->nodes[node_idx]); + names.push_back(ctx->query_fusion_names[node_idx]); + node_idx += ctx->query_fusion_node_count[node_idx]; + } + prev_node_idx = cur_node_idx; + ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + } + } + ctx->perf_logger->print_timings(); + } + + if (!ctx->device->support_async) { + ggml_vk_synchronize(ctx); + } + + return GGML_STATUS_SUCCESS; + + UNUSED(backend); +} + +static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + VK_LOG_DEBUG("ggml_backend_vk_event_record(backend=" << backend << ", event=" << event << ")"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + vk_event *vkev = (vk_event *)event->context; + + ggml_vk_submit_transfer_ctx(ctx); + + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); + auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset + + if (vkev->has_event) { + // Move existing event into submitted + vkev->events_submitted.push_back(vkev->event); + } + + // Grab the next event and record it, create one if necessary + if (vkev->events_free.empty()) { + vkev->event = ctx->device->device.createEvent({}); + } else { + vkev->event = vkev->events_free.back(); + vkev->events_free.pop_back(); + } + + vkev->has_event = true; + + ggml_vk_set_event(compute_ctx, vkev->event); + + vkev->tl_semaphore.value++; + compute_ctx->s->signal_semaphores.push_back(vkev->tl_semaphore); + ggml_vk_ctx_end(compute_ctx); + + ggml_vk_submit(compute_ctx, {}); + ctx->submit_pending = true; + vkev->cmd_buffer = cmd_buf; + vkev->cmd_buffer_use_counter = cmd_buf->use_counter; + ctx->compute_ctx.reset(); +} + +static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + VK_LOG_DEBUG("ggml_backend_vk_event_wait(backend=" << backend << ", event=" << event << ")"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + vk_event *vkev = (vk_event *)event->context; + + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); + + if (vkev->has_event) { + // Wait for latest event + ggml_vk_wait_events(compute_ctx, { vkev->event }); + } +} + +static ggml_backend_i ggml_backend_vk_interface = { + /* .get_name = */ ggml_backend_vk_name, + /* .free = */ ggml_backend_vk_free, + /* .set_tensor_async = */ ggml_backend_vk_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_vk_get_tensor_async, + /* .set_tensor_2d_async = */ ggml_backend_vk_set_tensor_2d_async, + /* .get_tensor_2d_async = */ ggml_backend_vk_get_tensor_2d_async, + /* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async, + /* .synchronize = */ ggml_backend_vk_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_vk_graph_compute, + /* .event_record = */ ggml_backend_vk_event_record, + /* .event_wait = */ ggml_backend_vk_event_wait, + /* .graph_optimize = */ ggml_vk_graph_optimize, +}; + +static ggml_guid_t ggml_backend_vk_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + +ggml_backend_t ggml_backend_vk_init(size_t dev_num) { + VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); + + ggml_backend_vk_context * ctx = new ggml_backend_vk_context; + ggml_vk_init(ctx, dev_num); + + ggml_backend_t vk_backend = new ggml_backend { + /* .guid = */ ggml_backend_vk_guid(), + /* .iface = */ ggml_backend_vk_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), + /* .context = */ ctx, + }; + + if (!ctx->device->support_async) { + vk_backend->iface.get_tensor_async = nullptr; + } + + return vk_backend; +} + +bool ggml_backend_is_vk(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); +} + +int ggml_backend_vk_get_device_count() { + return ggml_vk_get_device_count(); +} + +void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); + int dev_idx = vk_instance.device_indices[device]; + ggml_vk_get_device_description(dev_idx, description, description_size); +} + +void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); + GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); + + vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; + vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops; + vk::PhysicalDeviceMemoryProperties2 memprops = {}; + const bool membudget_supported = vk_instance.device_supports_membudget[device]; + const bool is_integrated_gpu = vkdev.getProperties().deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + + if (membudget_supported) { + memprops.pNext = &budgetprops; + } + vkdev.getMemoryProperties2(&memprops); + + *total = 0; + *free = 0; + + for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) { + const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i]; + + if (is_integrated_gpu || (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal)) { + *total += heap.size; + + if (membudget_supported && i < budgetprops.heapUsage.size()) { + *free += budgetprops.heapBudget[i] - budgetprops.heapUsage[i]; + } else { + *free += heap.size; + } + } + } +} + +static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) { + GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size()); + + vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]]; + + vk::PhysicalDeviceProperties2 props = {}; + device.getProperties2(&props); + + return props.properties.deviceType; +} + +static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { + GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size()); + + vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]]; + + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool ext_support = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_EXT_pci_bus_info", properties.extensionName) == 0) { + ext_support = true; + break; + } + } + + if (!ext_support) { + return ""; + } + + vk::PhysicalDeviceProperties2 props = {}; + vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info = {}; + + props.pNext = &pci_bus_info; + + device.getProperties2(&props); + + const uint32_t pci_domain = pci_bus_info.pciDomain; + const uint32_t pci_bus = pci_bus_info.pciBus; + const uint32_t pci_device = pci_bus_info.pciDevice; + const uint8_t pci_function = (uint8_t) pci_bus_info.pciFunction; // pci function is between 0 and 7, prevent printf overflow warning + + char pci_bus_id[16] = {}; + snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); + + return std::string(pci_bus_id); +} + +static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; + ggml_backend_vk_get_device_memory(ctx->device, free, total); +} + +static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ggml_backend_vk_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_vk_host_buffer_type(); +} + +static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + + return ctx->is_integrated_gpu ? GGML_BACKEND_DEVICE_TYPE_IGPU : GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + + props->name = ggml_backend_vk_device_get_name(dev); + props->description = ggml_backend_vk_device_get_description(dev); + props->type = ggml_backend_vk_device_get_type(dev); + props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); + ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ true, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ true, + }; +} + +static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ggml_backend_vk_init(ctx->device); +} + +static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + const vk_device& device = ggml_vk_get_device(ctx->device); + + const bool uses_bda = (op->op == GGML_OP_IM2COL || op->op == GGML_OP_IM2COL_3D) && + device->shader_int64 && device->buffer_device_address; + + auto const & tensor_size_supported = [&](size_t tensor_size) { + if (tensor_size > device->max_buffer_size) { + return false; + } + // For im2col shaders using BDA, maxStorageBufferRange limit doesn't apply. + // If shader64BitIndexing is enabled, maxStorageBufferRange limit doesn't apply. + if (!uses_bda && !device->shader_64b_indexing) { + if (tensor_size > device->properties.limits.maxStorageBufferRange) { + return false; + } + } + return true; + }; + // reject any tensors larger than the max buffer size + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] && !tensor_size_supported(ggml_nbytes(op->src[i]))) { + return false; + } + } + if (!tensor_size_supported(ggml_nbytes(op))) { + return false; + } + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_XIELU: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SOFTPLUS: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_ROUND: + case GGML_UNARY_OP_CEIL: + case GGML_UNARY_OP_FLOOR: + case GGML_UNARY_OP_TRUNC: + case GGML_UNARY_OP_SGN: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && + (op->src[0]->type == op->type); + default: + return false; + } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_SWIGLU_OAI: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && + (op->src[0]->type == op->type) && + (!op->src[1] || op->src[1]->type == op->src[0]->type); + default: + return false; + } + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + { + ggml_type src0_type = op->src[0]->type; + if (op->op == GGML_OP_MUL_MAT_ID) { + if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) { + // If there's not enough shared memory for row_ids and the result tile, fallback to CPU + return false; + } + } + switch (src0_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + break; + default: + return false; + } + struct ggml_tensor * a; + struct ggml_tensor * b; + if (op->op == GGML_OP_MUL_MAT) { + a = op->src[0]; + b = op->src[1]; + } else { + a = op->src[2]; + b = op->src[1]; + } + if (a->ne[3] != b->ne[3]) { + return false; + } + if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_BF16) || + !(ggml_vk_dim01_contiguous(op->src[1]) || op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) { + return false; + } + if (op->src[0]->type == GGML_TYPE_BF16 && op->src[1]->type == GGML_TYPE_F16) { + // We currently don't have a bf16 x f16 shader, or an fp16->bf16 copy shader. + // So don't support this combination for now. + return false; + } + + return true; + } + case GGML_OP_FLASH_ATTN_EXT: + { + bool coopmat2 = device->coopmat2; + uint32_t HSK = op->src[1]->ne[0]; + uint32_t HSV = op->src[2]->ne[0]; + if ((HSK % 8) != 0 || (HSV % 8) != 0) { + return false; + } + if (op->src[4] && op->src[4]->type != GGML_TYPE_F32) { + return false; + } + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + if (op->type != GGML_TYPE_F32) { + return false; + } + if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { + return false; + } + auto fa_kv_ok = [coopmat2](ggml_type t) { + switch (t) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_0: + return true; + case GGML_TYPE_Q1_0: + return coopmat2; + default: + return false; + } + }; + if (!fa_kv_ok(op->src[1]->type) || !fa_kv_ok(op->src[2]->type)) { + return false; + } + if ((op->src[1]->type == GGML_TYPE_BF16) != (op->src[2]->type == GGML_TYPE_BF16)) { + return false; + } + if (!coopmat2 && !(device->subgroup_shuffle && device->subgroup_vote)) { + // scalar/coopmat1 FA uses subgroupShuffle/subgroupAll + return false; + } + return true; + } + case GGML_OP_GET_ROWS: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + case GGML_TYPE_I32: + return true; + default: + return false; + } + } + case GGML_OP_SET_ROWS: + { + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return true; + default: + return false; + } + } + case GGML_OP_CONT: + case GGML_OP_CPY: + case GGML_OP_DUP: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type; + + if (src0_type == GGML_TYPE_F32) { + switch (src1_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return true; + default: + break; + } + } + if (src1_type == GGML_TYPE_F32) { + switch (src0_type) { + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return true; + default: + break; + } + } + + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + + if ( + (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) || + (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) + ) { + return true; + } + + // We can handle copying from a type to the same type if it's + // either not quantized or is quantized and contiguous. + // We use f16 or f32 shaders to do the copy, + // so the type/block size must be a multiple of 4. + if (src0_type == src1_type && + (!ggml_is_quantized(src0_type) || (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op))) && + (ggml_type_size(src0_type) % 2) == 0) { + return true; + } + return false; + } + case GGML_OP_REPEAT: + return ggml_type_size(op->type) == ggml_type_size(op->src[0]->type) && + (ggml_type_size(op->type) == sizeof(float) || ggml_type_size(op->type) == 2); + case GGML_OP_REPEAT_BACK: + return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ROPE: + return ggml_is_contiguous_rows(op) && ggml_is_contiguous_rows(op->src[0]); + case GGML_OP_ROPE_BACK: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_RMS_NORM: + return true; + case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: + return ggml_is_contiguous(op->src[0]); + case GGML_OP_L2_NORM: + return ggml_is_contiguous_rows(op->src[0]) && + op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) && + (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); + case GGML_OP_ADD_ID: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->src[2]->type == GGML_TYPE_I32 && + op->type == GGML_TYPE_F32; + case GGML_OP_SILU_BACK: + case GGML_OP_RMS_NORM_BACK: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_CLAMP: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_LEAKY_RELU: + case GGML_OP_OPT_STEP_ADAMW: + case GGML_OP_OPT_STEP_SGD: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_LOG: + case GGML_OP_TRI: + case GGML_OP_DIAG: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + op->type == op->src[0]->type; + case GGML_OP_ARGSORT: + { + if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) { + return false; + } + // pipeline_argsort_large_f32 requires vulkan memory model. + if (device->vulkan_memory_model) { + return true; + } else { + return op->ne[0] <= (1 << device->max_workgroup_size_log2); + } + } + case GGML_OP_TOP_K: + { + if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) { + return false; + } + // We could potentially support larger, using argsort to sort the + // whole thing. Not clear if this is needed. + uint32_t min_pipeline = (uint32_t)log2f(float(op->ne[0])) + 1; + if (min_pipeline >= num_topk_pipelines || + !device->pipeline_topk_f32[min_pipeline]) { + return false; + } + } + return true; + case GGML_OP_UPSCALE: + if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) { + if ((op->op_params[0] & 0xFF) != GGML_SCALE_MODE_BILINEAR) { + return false; + } + } + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ACC: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + case GGML_OP_SET: + return op->src[0]->type == op->src[1]->type && op->src[0]->type == op->type && + (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_I32); + case GGML_OP_CONCAT: { + if (op->src[0]->type != op->src[1]->type || op->src[0]->type != op->type) { + return false; + } + const size_t type_size = ggml_type_size(op->type); + return ggml_blck_size(op->type) == 1 && + (type_size == 1 || type_size == 2 || type_size == 4 || type_size == 8); + } + case GGML_OP_ADD1: + return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32) + || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32) + || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16); + case GGML_OP_ARANGE: + return op->type == GGML_TYPE_F32; + case GGML_OP_FILL: + return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16; + case GGML_OP_SCALE: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_PAD: + case GGML_OP_ROLL: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_DIAG_MASK_INF: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SOFT_MAX: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32 + && (!op->src[1] || (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)); + case GGML_OP_SOFT_MAX_BACK: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32 + && ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]); + case GGML_OP_CUMSUM: + { + if (device->subgroup_arithmetic && device->subgroup_require_full_support) { + return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]); + } + return false; + } + case GGML_OP_SOLVE_TRI: + { + if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32) { + return false; + } + const uint32_t N = op->src[0]->ne[0]; + const uint32_t K = op->src[1]->ne[0]; + // K dimension limited to workgroup size + if (K > 1u << device->max_workgroup_size_log2) { + return false; + } + const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((N + K) * sizeof(float)); + + if (batch_N == 0) { + return false; + } + return true; + } + case GGML_OP_ARGMAX: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_COUNT_EQUAL: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_I32 + && ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_I32; + case GGML_OP_IM2COL: + return ggml_is_contiguous(op->src[1]) + && op->src[1]->type == GGML_TYPE_F32 + && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); + case GGML_OP_IM2COL_3D: + return op->src[1]->type == GGML_TYPE_F32 + && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); + case GGML_OP_TIMESTEP_EMBEDDING: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_CONV_2D_DW: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) + && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_POOL_2D: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: + return true; // all inputs are contiguous, see ggml.c + case GGML_OP_GATED_DELTA_NET: + { + const uint32_t S_v = op->src[2]->ne[0]; + if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) { + return false; + } + for (int i = 0; i < 6; i++) { + if (op->src[i] == nullptr || op->src[i]->type != GGML_TYPE_F32) { + return false; + } + } + return op->type == GGML_TYPE_F32; + } + case GGML_OP_SSM_SCAN: + { + for (int i = 0; i < 6; i++) { + if (op->src[i] && ggml_is_quantized(op->src[i]->type)) { + return false; + } + } + if (op->src[6] && op->src[6]->type != GGML_TYPE_I32) { + return false; + } + if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) { + return false; + } + + const uint32_t d_state = op->src[0]->ne[0]; + const uint32_t head_dim = op->src[0]->ne[1]; + + bool is_mamba2 = (op->src[3] && op->src[3]->nb[1] == sizeof(float)); + if (!is_mamba2) { + return false; + } + + if ((d_state != 128 && d_state != 256) || head_dim % 16 != 0) { + return false; + } + + size_t shmem_size = d_state * sizeof(float); + + if (shmem_size > device->properties.limits.maxComputeSharedMemorySize) { + return false; + } + + if (!device->subgroup_basic) { + return false; + } + + return true; + } + case GGML_OP_SSM_CONV: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_CONV_TRANSPOSE_1D: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_COL2IM_1D: + return (op->src[0]->type == GGML_TYPE_F32 || + op->src[0]->type == GGML_TYPE_F16 || + op->src[0]->type == GGML_TYPE_BF16) && + op->type == op->src[0]->type && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op); + case GGML_OP_CONV_2D: + case GGML_OP_CONV_TRANSPOSE_2D: + { + // Channel-contiguous format is not supported yet. + return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]) && + ggml_is_contiguous(op)); + } + default: + return false; + } + + UNUSED(dev); +} + +static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + + return buft_ctx->device->idx == ctx->device; +} + +static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context; + + return ggml_vk_get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size; +} + +static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + auto device = ggml_vk_get_device(ctx->device); + + vk_event *vkev = new vk_event; + if (!vkev) { + return nullptr; + } + + // No events initially, they get created on demand + vkev->has_event = false; + + vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; + vk::SemaphoreCreateInfo ci{}; + ci.setPNext(&tci); + vkev->tl_semaphore = { device->device.createSemaphore(ci), 0 }; + + return new ggml_backend_event { + /* .device = */ dev, + /* .context = */ vkev, + }; +} + +static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + auto device = ggml_vk_get_device(ctx->device); + + vk_event *vkev = (vk_event *)event->context; + + device->device.destroySemaphore(vkev->tl_semaphore.s); + for (auto& event : vkev->events_free) { + device->device.destroyEvent(event); + } + for (auto& event : vkev->events_submitted) { + device->device.destroyEvent(event); + } + if (vkev->has_event) { + device->device.destroyEvent(vkev->event); + } + delete vkev; + delete event; +} + +static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + VK_LOG_DEBUG("ggml_backend_vk_device_event_synchronize(backend=" << dev << ", event=" << event << ")"); + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + auto device = ggml_vk_get_device(ctx->device); + vk_event *vkev = (vk_event *)event->context; + + // Only do something if the event has actually been used + if (vkev->has_event) { + vk::Semaphore sem = vkev->tl_semaphore.s; + uint64_t val = vkev->tl_semaphore.value; + vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, sem, val}; + VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize"); + + // Reset and move submitted events + for (auto& event : vkev->events_submitted) { + device->device.resetEvent(event); + } + vkev->events_free.insert(vkev->events_free.end(), vkev->events_submitted.begin(), vkev->events_submitted.end()); + vkev->events_submitted.clear(); + + // Finished using current command buffer so we flag for reuse + if (vkev->cmd_buffer) { + // Only flag for reuse if it hasn't been reused already + if (vkev->cmd_buffer_use_counter == vkev->cmd_buffer->use_counter) { + vkev->cmd_buffer->in_use = false; + vkev->cmd_buffer->buf.reset(); + } + vkev->cmd_buffer = nullptr; + } + } +} + +static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); + GGML_UNUSED(max_tensor_size); + + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + auto device = ggml_vk_get_device(ctx->device); + + vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size); + + if (!buf) { + return {}; + } + + ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name); + + ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size); + + return ret; +} + +static const struct ggml_backend_device_i ggml_backend_vk_device_i = { + /* .get_name = */ ggml_backend_vk_device_get_name, + /* .get_description = */ ggml_backend_vk_device_get_description, + /* .get_memory = */ ggml_backend_vk_device_get_memory, + /* .get_type = */ ggml_backend_vk_device_get_type, + /* .get_props = */ ggml_backend_vk_device_get_props, + /* .init_backend = */ ggml_backend_vk_device_init, + /* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ ggml_backend_vk_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_vk_device_supports_op, + /* .supports_buft = */ ggml_backend_vk_device_supports_buft, + /* .offload_op = */ ggml_backend_vk_device_offload_op, + /* .event_new = */ ggml_backend_vk_device_event_new, + /* .event_free = */ ggml_backend_vk_device_event_free, + /* .event_synchronize = */ ggml_backend_vk_device_event_synchronize, +}; + +static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_VK_NAME; +} + +static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_vk_get_device_count(); +} + +static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; + + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32; + for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { + ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; + char desc[256]; + ggml_backend_vk_get_device_description(i, desc, sizeof(desc)); + ctx->device = i; + ctx->name = GGML_VK_NAME + std::to_string(i); + ctx->description = desc; + ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; + ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); + ctx->op_offload_min_batch_size = min_batch_size; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_vk_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; + } + } + + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { + /* .get_name = */ ggml_backend_vk_reg_get_name, + /* .get_device_count = */ ggml_backend_vk_reg_get_device_count, + /* .get_device = */ ggml_backend_vk_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_vk_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, + }; + try { + ggml_vk_instance_init(); + return ® + } catch (const vk::SystemError& e) { + VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: System error: " << e.what()); + return nullptr; + } catch (const std::exception &e) { + VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: " << e.what()); + return nullptr; + } catch (...) { + VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: unknown exception during Vulkan init"); + return nullptr; + } +} + +GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg) + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp new file mode 100644 index 0000000000..1f2e849da5 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp @@ -0,0 +1,783 @@ +#include "ggml-vulkan-common.h" + +ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { + /* .get_name = */ ggml_backend_vk_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +static std::vector ggml_vk_find_memory_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { + std::vector indices; + + for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { + vk::MemoryType memory_type = mem_props->memoryTypes[i]; + if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) && + (flags & memory_type.propertyFlags) == flags && + mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) { + indices.push_back(i); + } + } + return indices; +} + +static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list & req_flags_list, + void *import_ptr = nullptr) { + VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")"); + if (size > device->max_buffer_size) { + throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit"); + } + + vk_buffer buf = std::make_shared(); + + if (size == 0) { + buf->size = 0; + return buf; + } + + vk::BufferUsageFlags usage_flags = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; + vk::MemoryAllocateFlags mem_flags {}; + if (device->buffer_device_address) { + usage_flags |= vk::BufferUsageFlagBits::eShaderDeviceAddress; + mem_flags |= vk::MemoryAllocateFlagBits::eDeviceAddress; + } + + vk::BufferCreateInfo buffer_create_info{ + vk::BufferCreateFlags(), + size, + usage_flags, + vk::SharingMode::eExclusive, + 0, + nullptr, + }; + + vk::ExternalMemoryBufferCreateInfo external_memory_bci; + if (import_ptr) { + external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; + buffer_create_info.setPNext(&external_memory_bci); + } + + buf->buffer = device->device.createBuffer(buffer_create_info); + + vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer); + + vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); + + const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f }; + + vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags }; + + if (device->memory_priority) { + mem_flags_info.setPNext(&mem_priority_info); + } + + if (import_ptr) { + vk::MemoryHostPointerPropertiesEXT host_pointer_props; + try { + host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr); + } catch (vk::SystemError& e) { + GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what()); + device->device.destroyBuffer(buf->buffer); + return {}; + } + vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); + + uint32_t memory_type_idx; + vk::MemoryPropertyFlags property_flags = *req_flags_list.begin(); + for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) { + if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) { + continue; + } + if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) { + continue; + } + + vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx]; + // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed + if ((memory_type.propertyFlags & property_flags) == property_flags) { + property_flags = memory_type.propertyFlags; + break; + } + } + if (memory_type_idx == 32) { + GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n"); + device->device.destroyBuffer(buf->buffer); + return {}; + } + + buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags; + try { + vk::ImportMemoryHostPointerInfoEXT import_info; + import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; + import_info.pHostPointer = import_ptr; + import_info.setPNext(&mem_flags_info); + buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info }); + } catch (const vk::SystemError& e) { + } + } else { + for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) { + const auto & req_flags = *it; + + const std::vector memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags); + + if (memory_type_indices.empty()) { + continue; + } + + bool done = false; + + for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) { + try { + buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info }); + buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags; + done = true; + break; + } catch (const vk::SystemError& e) { + // loop and retry + // during last attempt throw the exception + if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) { + device->device.destroyBuffer(buf->buffer); + throw e; + } + } + } + + if (done) { + break; + } + } + } + + if (!buf->device_memory) { + device->device.destroyBuffer(buf->buffer); + throw vk::OutOfDeviceMemoryError("No suitable memory type found"); + } + + buf->ptr = nullptr; + + if (import_ptr) { + buf->ptr = import_ptr; + } else { + if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); + } + } + + device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); + + buf->device = device; + buf->size = size; + + if (device->buffer_device_address) { + const vk::BufferDeviceAddressInfo addressInfo(buf->buffer); + buf->bda_addr = device->device.getBufferAddress(addressInfo); + } + + device->memory_logger->log_allocation(buf, size); + + return buf; +} + +vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags) { + try { + return ggml_vk_create_buffer(device, size, {req_flags, fallback_flags}); + } catch (const vk::SystemError& e) { + std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; + } +} + +vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { + vk_buffer buf; + try { + if (device->prefer_host_memory) { + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryPropertyFlagBits::eDeviceLocal}); + } else if (device->uma) { + // On UMA, prefer host-visible memory so direct tensor borrowing works. + // If unavailable, fall back to device-local memory. + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryPropertyFlagBits::eDeviceLocal, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + } else if (device->disable_host_visible_vidmem) { + if (device->allow_sysmem_fallback) { + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + } else { + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + } + } else { + // use rebar if available, otherwise fallback to device only visible memory + if (device->allow_sysmem_fallback) { + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryPropertyFlagBits::eDeviceLocal, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + } else { + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryPropertyFlagBits::eDeviceLocal}); + } + } + } catch (const vk::SystemError& e) { + std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; + } + + return buf; +} + +void ggml_vk_destroy_buffer(vk_buffer& buf) { + if (buf == nullptr) { + return; + } + + if (buf->device != nullptr) { + buf->device->memory_logger->log_deallocation(buf); + } + + buf.reset(); +} + +void * ggml_vk_host_malloc(vk_device& device, size_t size) { + VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")"); + vk_buffer buf = ggml_vk_create_buffer(device, size, + {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + + if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", + size/1024.0/1024.0); + device->device.freeMemory(buf->device_memory); + device->device.destroyBuffer(buf->buffer); + return nullptr; + } + + std::lock_guard guard(device->pinned_memory_mutex); + device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); + + return buf->ptr; +} + +void ggml_vk_host_free(vk_device& device, void* ptr) { + if (ptr == nullptr) { + return; + } + VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")"); + std::lock_guard guard(device->pinned_memory_mutex); + + vk_buffer buf; + size_t index; + for (size_t i = 0; i < device->pinned_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]); + if (ptr >= addr && ptr < endr) { + buf = std::get<2>(device->pinned_memory[i]); + index = i; + break; + } + } + if (buf == nullptr) { + fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n"); + return; + } + + ggml_vk_destroy_buffer(buf); + + device->pinned_memory.erase(device->pinned_memory.begin() + index); +} + +void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { + std::shared_lock guard(device->pinned_memory_mutex); + buf = nullptr; + buf_offset = 0; + for (size_t i = 0; i < device->pinned_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]); + if (ptr >= addr && ptr < endr) { + buf = std::get<2>(device->pinned_memory[i]); + buf_offset = ((const uint8_t *)ptr) - addr; + break; + } + } +} + +void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) { + if (device->sync_staging == nullptr || device->sync_staging->size < size) { + VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); + ggml_vk_destroy_buffer(device->sync_staging); + device->sync_staging = ggml_vk_create_buffer_check(device, size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + } +} + +void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { + if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { + VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); + ggml_vk_destroy_buffer(ctx->sync_staging); + ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + } +} + +static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { + VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); + GGML_ASSERT(!ggml_is_contiguous(tensor)); + // Buffer is already mapped + if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; + GGML_ABORT("fatal error"); + } + // Check if src is pinned memory + vk_buffer buf = nullptr; + size_t buf_offset = 0; + ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset); + + const uint64_t ne0 = tensor->ne[0]; + const uint64_t ne1 = tensor->ne[1]; + const uint64_t ne2 = tensor->ne[2]; + const uint64_t ne3 = tensor->ne[3]; + const uint64_t nb0 = tensor->nb[0]; + const uint64_t nb1 = tensor->nb[1]; + const uint64_t nb2 = tensor->nb[2]; + const uint64_t nb3 = tensor->nb[3]; + const ggml_type type = tensor->type; + const uint64_t ts = ggml_type_size(type); + const uint64_t bs = ggml_blck_size(type); + + const uint64_t dstnb0 = ts; + const uint64_t dstnb1 = dstnb0*(ne0/bs); + const uint64_t dstnb2 = dstnb1*ne1; + const uint64_t dstnb3 = dstnb2*ne2; + + const uint64_t ne = ggml_nelements(tensor); + + if (buf != nullptr) { + // Memory is pinned, use as staging buffer + std::vector slices; + + for (uint64_t i3 = 0; i3 < ne3; i3++) { + for (uint64_t i2 = 0; i2 < ne2; i2++) { + // Find longest contiguous slice + if (ne1*nb1 == dstnb2) { + slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 }); + } else { + for (uint64_t i1 = 0; i1 < ne1; i1++) { + if (ne0*nb0/bs == dstnb1) { + slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 }); + } else { + const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; + const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; + for (uint64_t i0 = 0; i0 < ne0; i0++) { + slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 }); + } + } + } + } + } + } + + ggml_vk_sync_buffers(ctx, subctx); + subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); + return; + } + + if (!sync_staging) { + GGML_ABORT("Asynchronous write to non-pinned memory not supported"); + } + + // Staging buffer required + vk_buffer& staging = ctx->device->sync_staging; + const uint64_t copy_size = ts*ne/bs; + ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); + VkBufferCopy buf_copy{ 0, offset, copy_size }; + + ggml_vk_sync_buffers(ctx, subctx); + vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); + + for (uint64_t i3 = 0; i3 < ne3; i3++) { + for (uint64_t i2 = 0; i2 < ne2; i2++) { + // Find longest contiguous slice + if (ne1*nb1 == dstnb2) { + deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); + } else { + for (uint64_t i1 = 0; i1 < ne1; i1++) { + if (ne0*nb0/bs == dstnb1) { + deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); + } else { + const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; + const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1; + for (uint64_t i0 = 0; i0 < ne0; i0++) { + deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); + } + } + } + } + } + } +} + +bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging) { + VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); + // Check if src is pinned memory + vk_buffer buf = nullptr; + size_t buf_offset = 0; + ggml_vk_host_get(dst->device, src, buf, buf_offset); + + if (buf != nullptr) { + // Memory is pinned, use as staging buffer + std::vector slices(1); + if (width == spitch && width == dpitch) { + // Only do single write if stride is equal + slices[0].srcOffset = buf_offset; + slices[0].dstOffset = offset; + slices[0].size = width * height; + } else { + slices.resize(height); + for (size_t i = 0; i < height; i++) { + slices[i].srcOffset = buf_offset + i * spitch; + slices[i].dstOffset = offset + i * dpitch; + slices[i].size = width; + } + } + + ggml_vk_sync_buffers(nullptr, subctx); + subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); + return true; + } + VK_LOG_DEBUG("STAGING"); + + if (!sync_staging) { + // copy was not handled caller needs to fall back + return false; + } + + // Staging buffer required + const size_t staging_size = width * height; + ggml_vk_ensure_sync_staging_buffer(dst->device, staging_size); + + vk_buffer& staging_buffer = dst->device->sync_staging; + + std::vector slices(1); + if (width == dpitch) { + slices[0].srcOffset = 0; + slices[0].dstOffset = offset; + slices[0].size = staging_size; + } else { + slices.resize(height); + for (size_t i = 0; i < height; i++) { + slices[i].srcOffset = i * width; + slices[i].dstOffset = offset + i * dpitch; + slices[i].size = width; + } + } + + ggml_vk_sync_buffers(nullptr, subctx); + subctx->s->buffer->buf.copyBuffer((VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, slices); + + if (width == spitch) { + deferred_memcpy((uint8_t *)staging_buffer->ptr, src, staging_size, &subctx->in_memcpys); + } else { + for (size_t i = 0; i < height; i++) { + deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); + } + } + return true; +} + +bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging) { + VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); + return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, size, 1, sync_staging); +} + +void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height) { + VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")"); + // Buffer is already mapped + if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + if (width == spitch && width == dpitch) { + memcpy((uint8_t *)dst->ptr + offset, src, width * height); + } else { + for (size_t i = 0; i < height; i++) { + memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width); + } + } + } else { + std::lock_guard guard(dst->device->mutex); + + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(dst->device, subctx); + bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, dpitch, width, height, true); + GGML_ASSERT(ret); + ggml_vk_ctx_end(subctx); + + for (auto& cpy : subctx->in_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + + for (auto& mset : subctx->memsets) { + memset(mset.dst, mset.val, mset.n); + } + + ggml_vk_submit(subctx, dst->device->fence); + VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); + dst->device->device.resetFences({ dst->device->fence }); + ggml_vk_queue_command_pools_cleanup(dst->device); + } +} + +void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) { + VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")"); + ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1); +} + +bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging) { + VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); + GGML_ASSERT(width > 0); + GGML_ASSERT(height > 0); + GGML_ASSERT(src != nullptr); + + // TODO: staging_offset is not used + + // Check if dst is pinned memory + vk_buffer buf = nullptr; + size_t buf_offset = 0; + ggml_vk_host_get(src->device, dst, buf, buf_offset); + + std::vector slices(1); + if (width == spitch && width == dpitch) { + // Only do single write if stride is equal + slices[0].srcOffset = offset; + slices[0].dstOffset = buf_offset; + slices[0].size = width * height; + } else { + slices.resize(height); + for (size_t i = 0; i < height; i++) { + slices[i].srcOffset = offset + i * spitch; + slices[i].dstOffset = buf_offset + i * dpitch; + slices[i].size = width; + } + } + + if (buf != nullptr) { + // Memory is pinned, use as staging buffer + ggml_vk_sync_buffers(nullptr, subctx); + subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices); + + return true; + } + VK_LOG_DEBUG("STAGING"); + + if (!sync_staging) { + // copy was not handled caller needs to fall back + return false; + } + + // Fall back to staging buffer + const size_t staging_size = width * height; + ggml_vk_ensure_sync_staging_buffer(src->device, staging_size); + + vk_buffer& staging_buffer = src->device->sync_staging; + + std::vector staging_slices(1); + if (width == spitch) { + staging_slices[0].srcOffset = offset; + staging_slices[0].dstOffset = 0; + staging_slices[0].size = staging_size; + } else { + staging_slices.resize(height); + for (size_t i = 0; i < height; i++) { + staging_slices[i].srcOffset = offset + i * spitch; + staging_slices[i].dstOffset = i * width; + staging_slices[i].size = width; + } + } + + ggml_vk_sync_buffers(nullptr, subctx); + subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, staging_slices); + + if (width == dpitch) { + deferred_memcpy(dst, staging_buffer->ptr, staging_size, &subctx->out_memcpys); + } else { + for (size_t i = 0; i < height; i++) { + deferred_memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) staging_buffer->ptr + i * width, width, &subctx->out_memcpys); + } + } + return true; +} + +static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) { + return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging); +} + +void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height) { + VK_LOG_DEBUG("ggml_vk_buffer_read_2d(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")"); + + // If the device is not an UMA device the memory is host-accessible through rebar. While writing + // through PCIe is sufficient fast reading back data from PCIe is slower than going through + // the HW device to host copy path. + if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) { + GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + std::lock_guard guard(src->device->mutex); + vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool); + ggml_vk_ctx_begin(src->device, subctx); + subctx->s->buffer->buf.pipelineBarrier( + vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost, + {}, + { { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead } }, + {}, {}); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, src->device->fence); + VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), + "vk_buffer_read_2d uma waitForFences"); + src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); + + if (width == spitch && width == dpitch) { + memcpy(dst, (const uint8_t *) src->ptr + offset, width * height); + } else { + for (size_t i = 0; i < height; i++) { + memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width); + } + } + } else { + std::lock_guard guard(src->device->mutex); + + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(src->device, subctx); + bool ret = ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, spitch, dpitch, width, height, true); + GGML_ASSERT(ret); + ggml_vk_ctx_end(subctx); + + ggml_vk_submit(subctx, src->device->fence); + VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read_2d waitForFences"); + src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); + + for (auto& cpy : subctx->out_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + } +} + +void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { + VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")"); + ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1); +} + +void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { + VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); + // Make sure both buffers are on same device + GGML_ASSERT(src->device == dst->device); + + VkBufferCopy bc{ src_offset, dst_offset, size }; + + vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc); +} + +void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { + if (src->device == dst->device) { + std::lock_guard guard(src->device->mutex); + VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); + // Copy within the device + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(src->device, subctx); + ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, src->device->fence); + VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); + src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); + } else { + VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); + // Copy device to device + ggml_vk_ensure_sync_staging_buffer(src->device, size); + + // Copy to src staging buffer + ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size); + // Copy to dst buffer + ggml_vk_buffer_write(dst, dst_offset, src->device->sync_staging->ptr, size); + } +} + +void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { + VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")"); + + if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && + dst->device->uma) { + deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets); + return; + } + + // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers + ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c); +} + +void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { + VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); + + if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && + dst->device->uma) { + memset((uint8_t*)dst->ptr + offset, c, size); + return; + } + + std::lock_guard guard(dst->device->mutex); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(dst->device, subctx); + subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c); + ggml_vk_ctx_end(subctx); + + ggml_vk_submit(subctx, dst->device->fence); + VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); + dst->device->device.resetFences({ dst->device->fence }); + ggml_vk_queue_command_pools_cleanup(dst->device); +} + +ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { + /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, + /* .get_base = */ ggml_backend_vk_buffer_get_base, + /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor, + /* .set_tensor_2d = */ ggml_backend_vk_buffer_set_tensor_2d, + /* .get_tensor_2d = */ ggml_backend_vk_buffer_get_tensor_2d, + /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, + /* .clear = */ ggml_backend_vk_buffer_clear, + /* .reset = */ NULL, +}; + +vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) { + if (!device->external_memory_host) { + return {}; + } + + uintptr_t uptr = reinterpret_cast(ptr); + if (uptr & (device->min_imported_host_pointer_alignment - 1)) { + return {}; + } + if (size & (device->min_imported_host_pointer_alignment - 1)) { + return {}; + } + + const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached; + + vk_buffer buf {}; + try { + buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr); + } catch (vk::SystemError& e) { + GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what()); + } + + return buf; +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-common.h b/ggml/src/ggml-vulkan/ggml-vulkan-common.h new file mode 100644 index 0000000000..31b88477ad --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-common.h @@ -0,0 +1,228 @@ +#pragma once +#include "ggml-vulkan-push-constants.h" + +extern std::mutex queue_mutex; +extern ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface; +extern bool vk_memory_logger_enabled; +extern bool vk_perf_logger_enabled; +extern bool vk_perf_logger_concurrent; +extern bool vk_enable_sync_logger; +extern uint32_t vk_perf_logger_frequency; +extern std::string vk_pipeline_stats_filter; +extern void * const vk_ptr_base; +extern vk_instance_t vk_instance; +extern ggml_backend_buffer_i ggml_backend_vk_buffer_interface; + +uint64_t vk_tensor_offset(const ggml_tensor * tensor); +uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t); +void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx); +void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); +void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n); +void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx); +void ggml_vk_submit(vk_context& ctx, vk::Fence fence); +uint32_t ggml_vk_find_queue_family_index(std::vector& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues); +void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only); +vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p); +vk_context ggml_vk_create_temporary_context(vk_command_pool& p); +void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p); +void ggml_vk_queue_command_pools_cleanup(vk_device& device); +vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)); +vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size); +void ggml_vk_destroy_buffer(vk_buffer& buf); +vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0); +void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx); +void ggml_vk_set_event(vk_context& ctx, vk::Event& event); +void ggml_vk_wait_events(vk_context& ctx, std::vector&& events); +vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc); +vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, bool use_mask, bool use_mask_opt, bool use_logit_softcap, ggml_type k_type, ggml_type v_type); +uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch); +void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr); +vk_device ggml_vk_get_device(size_t idx); +DispatchLoaderDynamic & ggml_vk_default_dispatcher(); +void ggml_vk_instance_init(); +void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx); +vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type); +void * ggml_vk_host_malloc(vk_device& device, size_t size); +void ggml_vk_host_free(vk_device& device, void* ptr); +void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset); +vk_subbuffer ggml_vk_tensor_subbuffer( const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false); +void ggml_vk_ctx_end(vk_context& ctx); +void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx); +vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx); +bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx); +size_t ggml_vk_align_size(size_t width, size_t align); +void deferred_memcpy(void * dst, const void * src, size_t size, std::vector* memcpys = nullptr); +void deferred_memset(void * dst, uint32_t val, size_t size, std::vector* memsets = nullptr); +void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size); +void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size); +bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false); +bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false); +void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height); +void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size); +bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false); +void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height); +void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size); +void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size); +void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size); +void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size); +void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size); +void ggml_vk_matmul( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t padded_n); +bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor); +vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to); +void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out); +vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type); +void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, const vk_subbuffer & in, const vk_subbuffer & out, uint32_t ne); +bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst); +void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst); +void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx); +bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx); +void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx); +bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type); +bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16); +void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst); +void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx); +void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst); +void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx); +void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst); +void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst); +void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node); +void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params); +void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst); +void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx); +void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop); +void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx); +void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst); +void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx); +bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit); +void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready); +void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx); +void ggml_vk_cleanup(ggml_backend_vk_context * ctx); +int ggml_vk_get_device_count(); +void ggml_vk_get_device_description(int device, char * description, size_t description_size); +bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer); +void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer); +void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer); +enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor); +void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); +void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size); +void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data); +void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size); +void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data); +bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst); +void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value); +const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); +ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); +size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); +size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); +size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); +void ggml_backend_vk_free(ggml_backend_t backend); +void ggml_vk_synchronize(ggml_backend_vk_context * ctx); +bool ggml_vk_is_empty(ggml_tensor * node); +bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops); +bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, int num_extra); +bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, topk_moe_mode mode); +bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx); +bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx); +bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise); +bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx); +uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx); +void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph); +int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op); +vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size); +ggml_backend_reg_t ggml_backend_vk_reg(); +bool ggml_vk_instance_layer_settings_available(); +bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); +bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); +bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev); +bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch); +uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev); + +template +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, const T &push_constants, std::array elements) { + const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); + const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); + const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); + VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; + for (auto& buffer : descriptor_buffer_infos) { + std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), "; + } + std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); + GGML_ASSERT(wg0 <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] && + wg1 <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && + wg2 <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size()); + GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT); + GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size()); + GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants)); + + vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++]; + vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; + ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); + + subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants)); + subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); + subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, + pipeline->layout, + 0, + { descriptor_set }, + {}); + subctx->s->buffer->buf.dispatch(wg0, wg1, wg2); +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp new file mode 100644 index 0000000000..dd99d41299 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp @@ -0,0 +1,1608 @@ +#include "ggml-vulkan-common.h" + +bool vk_memory_logger_enabled = false; + +bool vk_perf_logger_enabled = false; + +bool vk_perf_logger_concurrent = false; + +bool vk_enable_sync_logger = false; + +uint32_t vk_perf_logger_frequency = 1; + +std::string vk_pipeline_stats_filter; + +void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { + if (!vk_memory_logger_enabled) { + return; + } + std::lock_guard guard(log_mutex); + vk_buffer buf = buf_ref.lock(); + const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); + const std::string type = device ? "device" : "host"; + allocations[buf->buffer] = size; + total_device += device ? size : 0; + total_host += device ? 0 : size; + VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); +} + +void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { + if (buf_ref.expired() || buf_ref.lock()->size == 0 || !vk_memory_logger_enabled) { + return; + } + + std::lock_guard guard(log_mutex); + vk_buffer buf = buf_ref.lock(); + const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); + std::string type = device ? "device" : "host"; + auto it = allocations.find(buf->buffer); + if (it != allocations.end()) { + total_device -= device ? it->second : 0; + total_host -= device ? 0 : it->second; + VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); + allocations.erase(it); + } else { + VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer); + } +} + +#ifdef GGML_VULKAN_CHECK_RESULTS +static size_t vk_skip_checks; +static size_t vk_output_tensor; + +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); +#endif + +#ifdef GGML_VULKAN_RUN_TESTS +static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { + if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { + return; + } + i0 = std::max(i0, 5); + i1 = std::max(i1, 5); + i2 = std::max(i2, 0); + fprintf(stderr, " "); + for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { + fprintf(stderr, "%7d ", idx1); + } + fprintf(stderr, "\n"); + for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { + fprintf(stderr, "%7d: ", idx0); + for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { + if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) { + float val; + if (type == GGML_TYPE_F32) { + val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0); + } else if (type == GGML_TYPE_F16) { + val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); + } else { + GGML_ABORT("fatal error"); + } + fprintf(stderr, "% 7.2f ", val); + } else { + fprintf(stderr, " "); + } + } + fprintf(stderr, "\n"); + } +} + +template +static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { + VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")"); + const size_t x_ne = m * k * batch; + const size_t y_ne = k * n * batch; + const size_t d_ne = m * n * batch; + + vk_pipeline p; + std::string shname; + if (shader_size == 0) { + if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32->a_s; + shname = "F32_ALIGNED_S"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32_f16->a_s; + shname = "F32_F16_ALIGNED_S"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_s; + shname = "F16_F32_ALIGNED_S"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16.f32acc->a_s; + shname = "F16_ALIGNED_S"; + } else { + GGML_ABORT("fatal error"); + } + } else if (shader_size == 1) { + if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32->a_m; + shname = "F32_ALIGNED_M"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32_f16->a_m; + shname = "F32_F16_ALIGNED_M"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_m; + shname = "F16_F32_ALIGNED_M"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16.f32acc->a_m; + shname = "F16_ALIGNED_M"; + } else { + GGML_ABORT("fatal error"); + } + } else if (shader_size == 2) { + if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32->a_l; + shname = "F32_ALIGNED_L"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32_f16->a_l; + shname = "F32_F16_ALIGNED_L"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_l; + shname = "F16_F32_ALIGNED_L"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16.f32acc->a_l; + shname = "F16_ALIGNED_L"; + } else { + GGML_ABORT("fatal error"); + } + } else { + GGML_ASSERT(0); + } + + const size_t kpad = ggml_vk_align_size(k, p->align); + + if (k != kpad) { + if (shader_size == 0) { + if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32->s; + shname = "F32_S"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32_f16->s; + shname = "F32_F16_S"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16_f32.f32acc->s; + shname = "F16_F32_S"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16.f32acc->s; + shname = "F16_S"; + } + } else if (shader_size == 1) { + if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32->m; + shname = "F32_M"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32_f16->m; + shname = "F32_F16_M"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16_f32.f32acc->m; + shname = "F16_F32_M"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16.f32acc->m; + shname = "F16_M"; + } + } else if (shader_size == 2) { + if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32->l; + shname = "F32_L"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f32_f16->l; + shname = "F32_F16_L"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16_f32.f32acc->l; + shname = "F16_F32_L"; + } else if (std::is_same() && std::is_same()) { + p = ctx->device->pipeline_matmul_f16.f32acc->l; + shname = "F16_L"; + } + } + } + + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); + + if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { + // Resize buffer + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); + } + ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + } + } + + ggml_pipeline_allocate_descriptor_sets(ctx); + + vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + + X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); + Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); + float* d = (float *) malloc(sizeof(float) * d_ne); + + for (size_t i = 0; i < x_ne; i++) { + if (std::is_same()) { + x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // x[i] = 1.0f; + // x[i] = i + 1; + // x[i] = (i % k == i / k) ? 1.0f : 0.0f; + } else if (std::is_same()) { + x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); + // x[i] = ggml_fp32_to_fp16(1.0f); + // x[i] = ggml_fp32_to_fp16(i + 1); + // x[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); + } else { + GGML_ABORT("fatal error"); + } + } + for (size_t i = 0; i < y_ne; i++) { + if (std::is_same()) { + y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // y[i] = (i % k == i / k) ? 1.0f : 0.0f; + // y[i] = i + 1; + } else if (std::is_same()) { + y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); + // y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); + // y[i] = ggml_fp32_to_fp16(i + 1); + } else { + GGML_ABORT("fatal error"); + } + } + + ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); + ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); + + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + ggml_vk_ctx_begin(ctx->device, subctx); + for (size_t i = 0; i < num_it; i++) { + ggml_vk_matmul( + ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k), + m, n, k, + k, k, m, k*m, k*n, m*n, + split_k, batch, batch, batch, 1, 1, n + ); + } + ggml_vk_ctx_end(subctx); + + auto begin = std::chrono::high_resolution_clock::now(); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); + + auto end = std::chrono::high_resolution_clock::now(); + double time = std::chrono::duration_cast(end-begin).count() / 1000.0; + + // copy dst to host + ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne); + + float * d_chk = (float *) malloc(sizeof(float) * d_ne); + + ggml_init_params iparams = { + /*.mem_size =*/ 1024*1024*1024, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ggml_ctx = ggml_init(iparams); + + ggml_type src0_type; + ggml_type src1_type; + + if (std::is_same()) { + src0_type = GGML_TYPE_F32; + } else if (std::is_same()) { + src0_type = GGML_TYPE_F16; + } else { + GGML_ABORT("fatal error"); + } + if (std::is_same()) { + src1_type = GGML_TYPE_F32; + } else if (std::is_same()) { + src1_type = GGML_TYPE_F16; + } else { + GGML_ABORT("fatal error"); + } + + ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); + ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch); + ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); + + src0_ggml->data = x; + src1_ggml->data = y; + tensor_ggml->data = d_chk; + + ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); + ggml_build_forward_expand(cgraph, tensor_ggml); + + ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); + + ggml_free(ggml_ctx); + + double avg_err = 0.0; + int first_err_n = -1; + int first_err_m = -1; + int first_err_b = -1; + + for (size_t i = 0; i < m*n*batch; i++) { + double err = std::fabs(d[i] - d_chk[i]); + avg_err += err; + + if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { + first_err_b = i / (m * n); + first_err_n = (i % (m * n)) / m; + first_err_m = (i % (m * n)) % m; + } + } + + avg_err /= m * n; + + double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0); + + std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; + + if (avg_err > 0.1 || std::isnan(avg_err)) { + std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; + std::cerr << "Actual result: " << std::endl << std::endl; + ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + std::cerr << "Expected result: " << std::endl << std::endl; + ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + if (split_k > 1) { + float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); + ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); + + std::cerr << "d_buf0: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "d_buf1: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "d_buf2: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "d_buf3: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + free(split_k_buf); + } + } + + free(d_chk); + + ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + + ggml_vk_destroy_buffer(d_X); + ggml_vk_destroy_buffer(d_Y); + ggml_vk_destroy_buffer(d_D); + + free(x); + free(y); + free(d); +} + +static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { + return; + } + i0 = std::max(i0, 5); + i1 = std::max(i1, 5); + i2 = std::max(i2, 0); + i3 = std::max(i3, 0); + fprintf(stderr, " "); + for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { + fprintf(stderr, "%7d ", idx1); + } + fprintf(stderr, "\n"); + for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { + fprintf(stderr, "%7d: ", idx0); + for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { + if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { + float val; + if (tensor->type == GGML_TYPE_F32) { + val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); + } else if (tensor->type == GGML_TYPE_F16) { + val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); + } else { + GGML_ABORT("fatal error"); + } + fprintf(stderr, "% 7.2f ", val); + } else { + fprintf(stderr, " "); + } + } + fprintf(stderr, "\n"); + } +} + +static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { + ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); +} + +static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) { + if (quant == GGML_TYPE_F32) { + memcpy(to, from, sizeof(float) * ne); + return; + } + + const auto * tt = ggml_get_type_traits(quant); + + ggml_to_float_t dequant_fn = tt->to_float; + + dequant_fn(from, to, ne); +} + +static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { + VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); + const size_t x_sz = sizeof(float) * ne; + const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne; + const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); + float * x = (float *) malloc(x_sz); + void * qx = malloc(qx_sz); + vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + float * x_ref = (float *) malloc(x_sz); + ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); + + for (size_t i = 0; i < ne; i++) { + x[i] = rand() / (float)RAND_MAX; + } + + vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant); + + ggml_vk_quantize_data(x, qx, ne, quant); + ggml_vk_dequantize_data(qx, x_ref, ne, quant); + + ggml_pipeline_request_descriptor_sets(ctx, p, 1); + + ggml_pipeline_allocate_descriptor_sets(ctx); + + ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); + + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + ggml_vk_ctx_begin(ctx->device, subctx); + const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; + ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1}); + ggml_vk_ctx_end(subctx); + + auto begin = std::chrono::high_resolution_clock::now(); + + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); + + auto end = std::chrono::high_resolution_clock::now(); + + double ms_dequant = std::chrono::duration_cast(end-begin).count() / 1000.0; + ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16); + + int first_err = -1; + + double avg_err = 0.0; + for (size_t i = 0; i < ne; i++) { + double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i])); + avg_err += error; + + if (first_err < 0 && error > 0.05) { + first_err = i; + } + } + + avg_err /= ne; + + std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl; + + if (avg_err > 0.1) { + std::cerr << "first_error = " << first_err << std::endl; + std::cerr << "Actual result: " << std::endl << std::endl; + for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { + std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", "; + } + std::cerr << std::endl << "Expected result: " << std::endl << std::endl; + for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { + std::cerr << x_ref[i] << ", "; + } + std::cerr << std::endl; + } + + ggml_vk_destroy_buffer(x_buf); + ggml_vk_destroy_buffer(qx_buf); + + free(x); + free(qx); + free(x_ref); + free(x_chk); +} + +// This does not work without ggml q8_1 quantization support +// +// typedef uint16_t ggml_half; +// typedef uint32_t ggml_half2; +// +// #define QK8_1 32 +// typedef struct { +// union { +// struct { +// ggml_half d; // delta +// ggml_half s; // d * sum(qs[i]) +// } GGML_COMMON_AGGR_S; +// ggml_half2 ds; +// } GGML_COMMON_AGGR_U; +// int8_t qs[QK8_1]; // quants +// } block_q8_1; +// +// static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { +// VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")"); +// GGML_ASSERT(quant == GGML_TYPE_Q8_1); +// +// const size_t x_sz = sizeof(float) * ne; +// const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); +// float * x = (float *) malloc(x_sz); +// block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz); +// block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz); +// vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); +// vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); +// +// for (size_t i = 0; i < ne; i++) { +// x[i] = rand() / (float)RAND_MAX; +// } +// +// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant); +// +// ggml_pipeline_request_descriptor_sets(ctx, p, 1); +// +// ggml_pipeline_allocate_descriptor_sets(ctx); +// +// ggml_vk_buffer_write(x_buf, 0, x, x_sz); +// +// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); +// ggml_vk_ctx_begin(ctx->device, subctx); +// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne); +// ggml_vk_ctx_end(subctx); +// +// auto begin = std::chrono::high_resolution_clock::now(); +// +// ggml_vk_submit(subctx, ctx->fence); +// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences"); +// ctx->device->device.resetFences({ ctx->fence }); +// ggml_vk_queue_command_pools_cleanup(ctx->device); +// +// auto end = std::chrono::high_resolution_clock::now(); +// +// double ms_quant = std::chrono::duration_cast(end-begin).count() / 1000.0; +// ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz); +// +// ggml_vk_quantize_data(x, qx_res, ne, quant); +// +// int first_err = -1; +// +// for (size_t i = 0; i < ne / 32; i++) { +// double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); +// +// if (first_err < 0 && error > 0.1) { +// first_err = i; +// } +// +// error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s)); +// +// if (first_err < 0 && error > 0.1) { +// first_err = i; +// } +// +// for (size_t j = 0; j < 32; j++) { +// uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]); +// +// if (first_err < 0 && error > 1) { +// first_err = i; +// } +// } +// } +// +// std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl; +// +// if (first_err != -1) { +// std::cerr << "first_error = " << first_err << std::endl; +// std::cerr << "Actual result: " << std::endl << std::endl; +// std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " "; +// for (size_t j = 0; j < 32; j++) { +// std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " "; +// } +// std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl; +// std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " "; +// for (size_t j = 0; j < 32; j++) { +// std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " "; +// } +// std::cerr << std::endl; +// } +// +// ggml_vk_destroy_buffer(x_buf); +// ggml_vk_destroy_buffer(qx_buf); +// +// free(x); +// free(qx); +// free(qx_res); +// } + +static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant, bool mmq = false) { + VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")"); + const size_t x_ne = m * k * batch; + const size_t y_ne = k * n * batch; + const size_t d_ne = m * n * batch; + + vk_matmul_pipeline2 * pipelines; + + if (mmq) { + pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1; + } else { + pipelines = ctx->device->pipeline_dequant_mul_mat_mat; + } + + const bool fp16acc = ctx->device->fp16; + + vk_pipeline p; + std::string shname; + if (shader_size == 0) { + p = fp16acc ? pipelines[quant].f16acc->a_s : pipelines[quant].f32acc->a_s; + shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S"; + } else if (shader_size == 1) { + p = fp16acc ? pipelines[quant].f16acc->a_m : pipelines[quant].f32acc->a_m; + shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M"; + } else if (shader_size == 2) { + p = fp16acc ? pipelines[quant].f16acc->a_l : pipelines[quant].f32acc->a_l; + shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L"; + } else { + GGML_ASSERT(0); + } + + const size_t kpad = mmq ? 0 : ggml_vk_align_size(k, p->align); + + if (mmq || k != kpad) { + if (shader_size == 0) { + p = fp16acc ? pipelines[quant].f16acc->s : pipelines[quant].f32acc->s; + shname = std::string(ggml_type_name(quant)) + "_S"; + } else if (shader_size == 1) { + p = fp16acc ? pipelines[quant].f16acc->m : pipelines[quant].f32acc->m; + shname = std::string(ggml_type_name(quant)) + "_M"; + } else if (shader_size == 2) { + p = fp16acc ? pipelines[quant].f16acc->l : pipelines[quant].f32acc->l; + shname = std::string(ggml_type_name(quant)) + "_L"; + } else { + GGML_ASSERT(0); + } + } + + if (p == nullptr) { + std::cerr << "error: no pipeline for ggml_vk_test_dequant_matmul " << ggml_type_name(quant) << std::endl; + return; + } + + const size_t x_sz = sizeof(float) * x_ne; + const size_t y_sz = sizeof(float) * y_ne; + const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant); + const size_t qy_sz = mmq ? y_ne * ggml_type_size(GGML_TYPE_Q8_1)/ggml_blck_size(GGML_TYPE_Q8_1) : y_sz; + const size_t d_sz = sizeof(float) * d_ne; + float * x = (float *) malloc(x_sz); + float * y = (float *) malloc(y_sz); + void * qx = malloc(qx_sz); + vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + vk_buffer qy_buf = ggml_vk_create_buffer_check(ctx->device, qy_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + float * d = (float *) malloc(d_sz); + float * d_chk = (float *) malloc(d_sz); + + for (size_t i = 0; i < x_ne; i++) { + x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // x[i] = (i % k == i / k) ? 1.0f : 0.0f; + // x[i] = i % k; + } + + ggml_vk_quantize_data(x, qx, x_ne, quant); + + for (size_t i = 0; i < y_ne; i++) { + y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // y[i] = (i % k == i / k) ? 1.0f : 0.0f; + // y[i] = i % k; + } + + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); + + if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { + // Resize buffer + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); + } + ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, {vk::MemoryPropertyFlagBits::eDeviceLocal}); + } + } + if (mmq) { + vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it); + } + + ggml_pipeline_allocate_descriptor_sets(ctx); + + ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); + ggml_vk_buffer_write(y_buf, 0, y, y_sz); + + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + ggml_vk_ctx_begin(ctx->device, subctx); + if (mmq) { + for (size_t i = 0; i < num_it; i++) { + ggml_vk_quantize_q8_1(ctx, subctx, { y_buf, 0, y_sz }, { qy_buf, 0, qy_sz }, y_ne); + ggml_vk_matmul( + ctx, subctx, p, { qx_buf, 0, qx_sz }, { qy_buf, 0, qy_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k }, + m, n, k, + k, k, m, k*m, k*n, m*n, + split_k, batch, batch, batch, 1, 1, n + ); + } + } else { + for (size_t i = 0; i < num_it; i++) { + ggml_vk_matmul( + ctx, subctx, p, { qx_buf, 0, qx_sz }, { y_buf, 0, y_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k }, + m, n, k, + k, k, m, k*m, k*n, m*n, + split_k, batch, batch, batch, 1, 1, n + ); + } + } + ggml_vk_ctx_end(subctx); + + auto begin = std::chrono::high_resolution_clock::now(); + + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); + + auto end = std::chrono::high_resolution_clock::now(); + + double time_ms = std::chrono::duration_cast(end-begin).count() / 1000.0; + ggml_vk_buffer_read(d_buf, 0, d, d_sz); + + ggml_init_params iparams = { + /*.mem_size =*/ 1024*1024*1024, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ggml_ctx = ggml_init(iparams); + + ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch); + ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch); + ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); + + src0_ggml->data = qx; + src1_ggml->data = y; + tensor_ggml->data = d_chk; + + ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); + ggml_build_forward_expand(cgraph, tensor_ggml); + + ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); + + ggml_free(ggml_ctx); + + double avg_err = 0.0; + int first_err_n = -1; + int first_err_m = -1; + int first_err_b = -1; + + for (size_t i = 0; i < m*n*batch; i++) { + double err = std::fabs(d[i] - d_chk[i]); + avg_err += err; + + if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { + first_err_b = i / (m * n); + first_err_n = (i % (m * n)) / m; + first_err_m = (i % (m * n)) % m; + } + } + + avg_err /= m * n; + + double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0); + + std::cerr << "TEST dequant matmul " << shname; + if (mmq) { + std::cerr << " mmq"; + } + std::cerr << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; + + if (avg_err > 0.01 || std::isnan(avg_err)) { + std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; + std::cerr << "Actual result: " << std::endl << std::endl; + ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + std::cerr << std::endl; + std::cerr << "Expected result: " << std::endl << std::endl; + ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "src0: " << std::endl << std::endl; + ggml_vk_print_matrix_area(x, GGML_TYPE_F32, k, m, first_err_m, first_err_n, first_err_b); + std::cerr << std::endl; + std::cerr << "src1: " << std::endl << std::endl; + ggml_vk_print_matrix_area(y, GGML_TYPE_F32, k, n, first_err_m, first_err_n, first_err_b); + + if (split_k > 1) { + float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); + ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); + + std::cerr << "d_buf0: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "d_buf1: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "d_buf2: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + std::cerr << "d_buf3: " << std::endl << std::endl; + ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); + + free(split_k_buf); + } + } + + ggml_vk_destroy_buffer(qx_buf); + ggml_vk_destroy_buffer(y_buf); + ggml_vk_destroy_buffer(qy_buf); + ggml_vk_destroy_buffer(d_buf); + + free(x); + free(qx); + free(y); + free(d); + free(d_chk); +} +#endif + +int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_GET_ROWS: + return 0; + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + +#ifdef GGML_VULKAN_CHECK_RESULTS +static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector& done, int level = 0) { + if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) { + return; + } + for (int j = 0; j < level; j++) { + std::cerr << " "; + } + std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl; + + done.push_back(tensor); + + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (tensor->src[i] != nullptr) { + ggml_vk_print_graph_origin(tensor->src[i], done, level + 1); + } + } +} + +static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) { + if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) { + return; + } + i0 = std::max(i0, 5); + i1 = std::max(i1, 5); + i2 = std::max(i2, 0); + i3 = std::max(i3, 0); + fprintf(stderr, " "); + for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { + fprintf(stderr, "%7d ", idx1); + } + fprintf(stderr, "\n"); + for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { + fprintf(stderr, "%7d: ", idx0); + for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { + if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { + float val; + if (tensor->type == GGML_TYPE_F32) { + val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); + } else if (tensor->type == GGML_TYPE_F16) { + val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); + } else if (tensor->type == GGML_TYPE_I32) { + val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); + } else { + GGML_ABORT("fatal error"); + } + fprintf(stderr, "% 7.2f ", val); + } else { + fprintf(stderr, " "); + } + } + fprintf(stderr, "\n"); + } +} + +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { + void * tensor_data = tensor->data; + + const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer); + + if (is_gpu) { + const size_t tensor_size = ggml_nbytes(tensor); + tensor_data = malloc(tensor_size); + + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; + + vk_buffer buffer_gpu = buf_ctx->dev_buffer; + ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size); + } + + std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; + std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl; + if (tensor->src[0] != nullptr) { + std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl; + } + if (tensor->src[1] != nullptr) { + std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl; + } + std::cerr << std::endl << "Result:" << std::endl; + ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); + std::cerr << std::endl; + std::vector done; + ggml_vk_print_graph_origin(tensor, done); + + if (is_gpu) { + free(tensor_data); + } +} + +void * comp_result; +size_t comp_size; +size_t comp_nb[GGML_MAX_DIMS]; +size_t check_counter = 0; +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { + ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops]; + if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) { + return; + } + + check_counter++; + if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { + return; + } + + VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")"); + + struct ggml_init_params iparams = { + /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ggml_ctx = ggml_init(iparams); + + std::array src_clone = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; + const char * srci_name[GGML_MAX_SRC] = {"src0", "src1", "src2", "src3", "src4", "src5", "src6", "src7", "src8", "src9"}; + + std::map cloned_tensors; + std::vector cloned_mallocs; + + struct ggml_tensor * tensor_clone = nullptr; + + for (int f = 0; f < ctx->num_additional_fused_ops + 1; ++f) { + tensor = cgraph->nodes[tensor_idx + f]; + for (int i = 0; i < GGML_MAX_SRC; i++) { + ggml_tensor * srci = tensor->src[i]; + if (srci == nullptr) { + continue; + } + // If a src tensor has been cloned, use that one + auto it = cloned_tensors.find(srci); + if (it != cloned_tensors.end()) { + src_clone[i] = it->second; + continue; + } + ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci); + size_t srci_size = ggml_nbytes(srci); + + src_clone[i] = srci_clone; + void *src_buffer = malloc(srci_size); + cloned_mallocs.push_back(src_buffer); + + srci_clone->data = src_buffer; + if (ggml_backend_buffer_is_host(srci->buffer)) { + memcpy(srci_clone->data, srci->data, srci_size); + memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); + } else if (ggml_backend_buffer_is_vk(srci->buffer)) { + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(srci) + srci->view_offs; + if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) { + for (int i3 = 0; i3 < srci->ne[3]; i3++) { + for (int i2 = 0; i2 < srci->ne[2]; i2++) { + const int idx = i3*srci->ne[2] + i2; + ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]); + } + } + + srci_clone->nb[0] = srci->nb[0]; + srci_clone->nb[1] = srci->nb[1]; + for (int i = 2; i < GGML_MAX_DIMS; i++) { + srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1]; + } + } else { + if (offset + srci_size >= buffer_gpu->size) { + srci_size = buffer_gpu->size - offset; + } + ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size); + memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); + } + } else { + GGML_ABORT("fatal error"); + } + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(srci, srci_name[i]); + } + } + + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); + if (src_clone[4]) { + ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); + } + } else if (tensor->op == GGML_OP_MUL_MAT) { + tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL_MAT_ID) { + tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_SUB) { + tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL) { + tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_DIV) { + tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_CONCAT) { + tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params); + } else if (tensor->op == GGML_OP_UPSCALE) { + tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]); + } else if (tensor->op == GGML_OP_SCALE) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); + } else if (tensor->op == GGML_OP_ADD1) { + tensor_clone = ggml_add1(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ARANGE) { + const float start = ggml_get_op_params_f32(tensor, 0); + const float stop = ggml_get_op_params_f32(tensor, 1); + const float step = ggml_get_op_params_f32(tensor, 2); + tensor_clone = ggml_arange(ggml_ctx, start, stop, step); + } else if (tensor->op == GGML_OP_FILL) { + const float value = ggml_get_op_params_f32(tensor, 0); + tensor_clone = ggml_fill(ggml_ctx, src_clone[0], value); + } else if (tensor->op == GGML_OP_SQR) { + tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SQRT) { + tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SIN) { + tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_COS) { + tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_LOG) { + tensor_clone = ggml_log(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_TRI) { + tensor_clone = ggml_tri(ggml_ctx, src_clone[0], (ggml_tri_type)ggml_get_op_params_i32(tensor, 0)); + } else if (tensor->op == GGML_OP_DIAG) { + tensor_clone = ggml_diag(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_CLAMP) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); + } else if (tensor->op == GGML_OP_PAD) { + tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], + tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); + } else if (tensor->op == GGML_OP_REPEAT) { + tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); + } else if (tensor->op == GGML_OP_REPEAT_BACK) { + tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor); + } else if (tensor->op == GGML_OP_ADD) { + tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ACC) { + tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); + } else if (tensor->op == GGML_OP_SET) { + tensor_clone = ggml_set(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); + } else if (tensor->op == GGML_OP_NORM) { + tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_GROUP_NORM) { + const float * float_params = (const float *)tensor->op_params; + tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]); + } else if (tensor->op == GGML_OP_RMS_NORM) { + tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_RMS_NORM_BACK) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps); + } else if (tensor->op == GGML_OP_SILU_BACK) { + tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_L2_NORM) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps); + } else if (tensor->op == GGML_OP_SOFT_MAX) { + if (tensor->src[1] != nullptr) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]); + } else { + tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]); + } + } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) { + tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); + } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { + tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]); + } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) { + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; + const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; + const float freq_base = ((float *) tensor->op_params)[5]; + const float freq_scale = ((float *) tensor->op_params)[6]; + const float ext_factor = ((float *) tensor->op_params)[7]; + const float attn_factor = ((float *) tensor->op_params)[8]; + const float beta_fast = ((float *) tensor->op_params)[9]; + const float beta_slow = ((float *) tensor->op_params)[10]; + if (mode & GGML_ROPE_TYPE_MROPE) { + int32_t *sections = ((int32_t *) tensor->op_params) + 11; + if (tensor->op == GGML_OP_ROPE) { + tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } else { + tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } + } else { + if (tensor->op == GGML_OP_ROPE) { + tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } else { + tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } + } + } else if (tensor->op == GGML_OP_UNARY) { + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_EXP: + tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_EXPM1: + tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_ELU: + tensor_clone = ggml_elu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_SILU: + tensor_clone = ggml_silu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_GELU: + tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_GELU_ERF: + tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_GELU_QUICK: + tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_RELU: + tensor_clone = ggml_relu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_XIELU: + tensor_clone = ggml_xielu(ggml_ctx, src_clone[0], 0, 0, 0, 0); + ggml_set_op_params_f32(tensor_clone, 1, ggml_get_op_params_f32(tensor, 1)); + ggml_set_op_params_f32(tensor_clone, 2, ggml_get_op_params_f32(tensor, 2)); + ggml_set_op_params_f32(tensor_clone, 3, ggml_get_op_params_f32(tensor, 3)); + ggml_set_op_params_f32(tensor_clone, 4, ggml_get_op_params_f32(tensor, 4)); + break; + case GGML_UNARY_OP_NEG: + tensor_clone = ggml_neg(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_TANH: + tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_SIGMOID: + tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_HARDSIGMOID: + tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_HARDSWISH: + tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_ABS: + tensor_clone = ggml_abs(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_SOFTPLUS: + tensor_clone = ggml_softplus(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_STEP: + tensor_clone = ggml_step(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_ROUND: + tensor_clone = ggml_round(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_CEIL: + tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_FLOOR: + tensor_clone = ggml_floor(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_TRUNC: + tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_SGN: + tensor_clone = ggml_sgn(ggml_ctx, src_clone[0]); + break; + default: + std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; + GGML_ABORT("fatal error"); + } + } else if (tensor->op == GGML_OP_GLU) { + if (src_clone[1] == nullptr) { + tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]); + } else { + tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); + } + ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2)); + ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3)); + } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { + if (tensor->src[1] == nullptr) { + tensor_clone = ggml_dup(ggml_ctx, src_clone[0]); + tensor_clone->type = tensor->type; + } else { + tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]); + } + } else if (tensor->op == GGML_OP_CONT) { + tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + } else if (tensor->op == GGML_OP_RESHAPE) { + tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + } else if (tensor->op == GGML_OP_VIEW) { + tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); + } else if (tensor->op == GGML_OP_PERMUTE) { + int32_t * params = (int32_t *)tensor->op_params; + tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]); + } else if (tensor->op == GGML_OP_TRANSPOSE) { + tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_GET_ROWS) { + tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ARGSORT) { + tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params); + } else if (tensor->op == GGML_OP_TOP_K) { + tensor_clone = ggml_top_k(ggml_ctx, src_clone[0], tensor->ne[0]); + } else if (tensor->op == GGML_OP_SUM) { + tensor_clone = ggml_sum(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SUM_ROWS) { + tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_CUMSUM) { + tensor_clone = ggml_cumsum(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_MEAN) { + tensor_clone = ggml_mean(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_ARGMAX) { + tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_COUNT_EQUAL) { + tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_SOLVE_TRI) { + tensor_clone = ggml_solve_tri(ggml_ctx, src_clone[0], src_clone[1], true, true, false); + } else if (tensor->op == GGML_OP_IM2COL) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + + const bool is_2D = tensor->op_params[6] == 1; + tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type); + } else if (tensor->op == GGML_OP_IM2COL_3D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t s2 = tensor->op_params[2]; + const int32_t p0 = tensor->op_params[3]; + const int32_t p1 = tensor->op_params[4]; + const int32_t p2 = tensor->op_params[5]; + const int32_t d0 = tensor->op_params[6]; + const int32_t d1 = tensor->op_params[7]; + const int32_t d2 = tensor->op_params[8]; + const int32_t IC = tensor->op_params[9]; + + tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); + } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { + const int32_t dim = tensor->op_params[0]; + const int32_t max_period = tensor->op_params[1]; + tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period); + } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){ + const int32_t s0 = tensor->op_params[0]; + const int32_t p0 = tensor->op_params[1]; + const int32_t d0 = tensor->op_params[2]; + tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); + } else if (tensor->op == GGML_OP_COL2IM_1D) { + const int32_t stride = tensor->op_params[0]; + const int32_t oc = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0); + } else if (tensor->op == GGML_OP_POOL_2D) { + enum ggml_op_pool op = static_cast(tensor->op_params[0]); + const int32_t k0 = tensor->op_params[1]; + const int32_t k1 = tensor->op_params[2]; + const int32_t s0 = tensor->op_params[3]; + const int32_t s1 = tensor->op_params[4]; + const int32_t p0 = tensor->op_params[5]; + const int32_t p1 = tensor->op_params[6]; + + tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1); + } else if (tensor->op == GGML_OP_CONV_2D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); + } else if (tensor->op == GGML_OP_CONV_2D_DW) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + tensor_clone = ggml_conv_2d_dw_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); + } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) { + const int32_t s = tensor->op_params[0]; + tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s); + } else if (tensor->op == GGML_OP_LEAKY_RELU) { + const float * op_params = (const float *)tensor->op_params; + tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false); + } else if (tensor->op == GGML_OP_RWKV_WKV6) { + tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2], src_clone[3], src_clone[4], src_clone[5]); + } else if (tensor->op == GGML_OP_RWKV_WKV7) { + tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], + src_clone[4], src_clone[5], src_clone[6]); + } else if (tensor->op == GGML_OP_GATED_DELTA_NET) { + tensor_clone = ggml_gated_delta_net(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2], src_clone[3], src_clone[4], src_clone[5], + ggml_get_op_params_i32(tensor, 0)); + } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) { + src_clone[0]->flags = tensor->src[0]->flags; + tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2], src_clone[3], src_clone[4]); + } else if (tensor->op == GGML_OP_OPT_STEP_SGD) { + src_clone[0]->flags = tensor->src[0]->flags; + tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2]); + } else if (tensor->op == GGML_OP_ADD_ID) { + tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_SSM_SCAN) { + tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], + src_clone[3], src_clone[4], src_clone[5], src_clone[6]); + } else if (tensor->op == GGML_OP_SSM_CONV) { + tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ROLL) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t s2 = tensor->op_params[2]; + const int32_t s3 = tensor->op_params[3]; + tensor_clone = ggml_roll(ggml_ctx, src_clone[0], s0, s1, s2, s3); + } + else { + std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; + GGML_ABORT("fatal error"); + } + cloned_tensors[tensor] = tensor_clone; + } + + ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx); + ggml_build_forward_expand(cgraph_cpu, tensor_clone); + + ggml_graph_compute_with_ctx(ggml_ctx, cgraph_cpu, 8); + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(tensor_clone, "tensor_clone"); + } + + comp_size = ggml_nbytes(tensor_clone); + + comp_result = malloc(comp_size); + memcpy(comp_result, tensor_clone->data, comp_size); + memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS); + + for (auto m : cloned_mallocs) { + free(m); + } + + ggml_free(ggml_ctx); + + VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")"); +} + +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { + ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops]; + if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) { + return; + } + + if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { + return; + } + + VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")"); + + ggml_tensor * src0 = tensor->src[0]; + ggml_tensor * src1 = tensor->src[1]; + ggml_tensor * src2 = tensor->src[2]; + ggml_tensor * src3 = tensor->src[3]; + + void * tensor_data = tensor->data; + + if (ggml_backend_buffer_is_vk(tensor->buffer)) { + size_t tensor_size = ggml_nbytes(tensor); + tensor_data = malloc(tensor_size); + + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; + + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs; + if (offset + tensor_size >= buffer_gpu->size) { + tensor_size = buffer_gpu->size - offset; + } + + ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size); + } + + float first_error_result = -1.0f; + float first_error_correct = -1.0f; + std::array first_error = { -1, -1, -1, -1 }; + double avg_err = 0.0; + size_t counter = 0; + + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + const bool buffer_size_fit = i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0] < comp_size; + float correct = 0.0f; + float result = 0.0f; + + if (buffer_size_fit) { + if (tensor->type == GGML_TYPE_F32) { + correct = *(float *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]); + result = *(float *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); + } else if (tensor->type == GGML_TYPE_F16) { + correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0])); + result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0])); + } else if (tensor->type == GGML_TYPE_BF16) { + correct = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0])); + result = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0])); + } else if (tensor->type == GGML_TYPE_I32) { + correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]); + result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); + } else if (tensor->type == GGML_TYPE_I64) { + correct = *(int64_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]); + result = *(int64_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); + } else { + std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl; + } + } else { + std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl; + GGML_ABORT("fatal error"); + } + + if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { + std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl; + std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; + if (src0 != nullptr) { + std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; + } + if (src1 != nullptr) { + std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; + } + if (src2 != nullptr) { + std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; + } + if (src3 != nullptr) { + std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; + } + std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; + std::cerr << std::endl << "Result:" << std::endl; + ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3); + std::cerr << std::endl << "Correct:" << std::endl; + ggml_vk_print_tensor_area(tensor, comp_result, i0, i1, i2, i3); + std::cerr << std::endl; + std::vector done; + ggml_vk_print_graph_origin(tensor, done); + GGML_ABORT("fatal error"); + } + const double denom = std::fabs(correct) > 1.0f ? (std::fabs(correct) > 1e-8 ? std::fabs(correct) : 1e-8) : 1.0f; + if (first_error[0] == -1 && std::fabs(correct - result) / denom > 0.5) { + first_error[0] = i0; + first_error[1] = i1; + first_error[2] = i2; + first_error[3] = i3; + first_error_result = result; + first_error_correct = correct; + } + + // Special case, value is infinite, avoid NaN result in avg_err + // NaN also appears in results, if both are nan error is 0 + if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) { + avg_err += std::fabs(correct - result) / denom; + } + counter++; + } + } + } + } + + avg_err /= counter; + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; + std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; + if (src0 != nullptr) { + std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; + } + if (src1 != nullptr) { + std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; + } + if (src2 != nullptr) { + std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; + } + if (src3 != nullptr) { + std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; + } + std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; + std::cerr << std::endl << "Result:" << std::endl; + ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); + std::cerr << std::endl << "Correct:" << std::endl; + ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0); + std::cerr << std::endl; + std::vector done; + ggml_vk_print_graph_origin(tensor, done); + } + + if (avg_err > 0.01 || std::isnan(avg_err)) { + std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; + std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; + if (src0 != nullptr) { + std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; + } + if (src1 != nullptr) { + std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; + } + if (src2 != nullptr) { + std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; + } + if (src3 != nullptr) { + std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; + } + std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; + std::cerr << std::endl << "Result:" << std::endl; + ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]); + std::cerr << std::endl << "Correct:" << std::endl; + ggml_vk_print_tensor_area(tensor, comp_result, first_error[0], first_error[1], first_error[2], first_error[3]); + std::cerr << std::endl; + std::vector done; + ggml_vk_print_graph_origin(tensor, done); + GGML_ABORT("fatal error"); + } else { + std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; + } + + free(comp_result); + comp_result = nullptr; + comp_size = 0; + + if (ggml_backend_buffer_is_vk(tensor->buffer)) { + free(tensor_data); + } + + VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); +} +#endif + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-flash-attn.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-flash-attn.cpp new file mode 100644 index 0000000000..8705b00403 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-flash-attn.cpp @@ -0,0 +1,293 @@ +#include "ggml-vulkan-common.h" + +void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; + std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; + std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + if (sinks) { + std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3]; + } + std::cerr << "))"); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const uint32_t nem0 = mask ? mask->ne[0] : 0; + const uint32_t nem1 = mask ? mask->ne[1] : 0; + const uint32_t nem2 = mask ? mask->ne[2] : 0; + const uint32_t nem3 = mask ? mask->ne[3] : 0; + + const uint32_t HSK = nek0; + const uint32_t HSV = nev0; + uint32_t N = neq1; + const uint32_t KV = nek1; + + GGML_ASSERT(ne0 == HSV); + GGML_ASSERT(ne2 == N); + + // input tensor rows must be contiguous + GGML_ASSERT(nbq0 == ggml_type_size(q->type)); + GGML_ASSERT(nbk0 == ggml_type_size(k->type)); + GGML_ASSERT(nbv0 == ggml_type_size(v->type)); + + GGML_ASSERT(neq0 == HSK); + + GGML_ASSERT(neq1 == N); + + GGML_ASSERT(nev1 == nek1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + assert(dst->type == GGML_TYPE_F32); + assert(q->type == GGML_TYPE_F32); + uint32_t gqa_ratio = 1; + uint32_t qk_ratio = neq2 / nek2; + uint32_t workgroups_x = (uint32_t)neq1; + uint32_t workgroups_y = (uint32_t)neq2; + uint32_t workgroups_z = (uint32_t)neq3; + + const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16; + + // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga. + // For coopmat2 FA, we always use the small size (which is still pretty large for gqa). + vk_fa_tuning_params tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, 512, KV, k->type, v->type, f32acc); + const uint32_t max_gqa = std::min(tuning_params.block_rows, 32u); + + if (N <= 8 && qk_ratio > 1 && qk_ratio <= max_gqa && + qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) { + // grouped query attention - make the N dimension equal to gqa_ratio, reduce + // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 + // and change addressing calculations to index Q's dimension 2. + gqa_ratio = qk_ratio; + N = gqa_ratio; + workgroups_y /= gqa_ratio; + } + + tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, N, KV, k->type, v->type, f32acc); + + const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type)); + uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type)); + uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type)); + + // For F32, the shader treats it as a block of size 4 (for vec4 loads) + if (k->type == GGML_TYPE_F32) { + k_stride /= 4; + } + if (v->type == GGML_TYPE_F32) { + v_stride /= 4; + } + + const uint32_t alignment = tuning_params.block_cols; + bool aligned = (KV % alignment) == 0 && + // the "aligned" shader variant will forcibly align strides, for performance + (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0; + + // Need to use the coopmat2 variant that clamps loads when HSK/HSV aren't sufficiently aligned. + if (((HSK | HSV) % 16) != 0 && tuning_params.path == FA_COOPMAT2) { + aligned = false; + } + + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. + bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16; + vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc, + mask != nullptr, use_mask_opt, logit_softcap != 0, k->type, v->type); + + vk_pipeline pipeline = nullptr; + + { + std::lock_guard guard(ctx->device->compile_mutex); + auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16; + auto it = pipelines.find(fa_pipeline_state); + if (it != pipelines.end()) { + pipeline = it->second; + } else { + pipelines[fa_pipeline_state] = pipeline = std::make_shared(); + } + } + + assert(pipeline); + // Compile early to initialize wg_denoms. + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + uint32_t split_kv = KV; + uint32_t split_k = 1; + + // Intel Alchemist prefers more workgroups + const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2) ? 2 : 1; + + // Use a placeholder core count if one isn't available. split_k is a big help for perf. + const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count * shader_core_count_multiplier : 16; + + const uint32_t Br = fa_pipeline_state.Br; + const uint32_t Bc = fa_pipeline_state.Bc; + + GGML_ASSERT(Br == pipeline->wg_denoms[0]); + const uint32_t Tr = CEIL_DIV(N, Br); + + // Try to use split_k when KV is large enough to be worth the overhead. + if (gqa_ratio > 1 && workgroups_x <= Br) { + split_k = shader_core_count * 2 / (workgroups_x * workgroups_y * workgroups_z); + } else if (gqa_ratio <= 1) { + uint32_t total_wgs_no_split = Tr * workgroups_y * workgroups_z; + if (total_wgs_no_split < shader_core_count * 2) { + split_k = shader_core_count * 2 / total_wgs_no_split; + } + } + + if (split_k > 1) { + // Try to evenly split KV into split_k chunks, but it needs to be a multiple + // of "align", so recompute split_k based on that. + split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment); + split_k = CEIL_DIV(KV, split_kv); + } + + // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1) + // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows. + // For matrices, the order is (inner to outer) [HSV, ne1, k, ne2, ne3]. + // For L/M, the order is (inner to outer) [ne1, k, ne2, ne3]. + const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne2 * ne3 : 0; + if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + + const uint32_t mask_opt_num_dwords = CEIL_DIV(nem0, 16 * Bc); + const uint64_t mask_opt_size = sizeof(uint32_t) * mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2 * nem3; + + vk_pipeline pipeline_fa_mask_opt = nullptr; + if (use_mask_opt) { + { + std::lock_guard guard(ctx->device->compile_mutex); + auto &pipelines = ctx->device->pipeline_fa_mask_opt; + auto it = pipelines.find({Br, Bc}); + if (it != pipelines.end()) { + pipeline_fa_mask_opt = it->second; + } else { + pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared(); + } + } + assert(pipeline_fa_mask_opt); + ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1); + + if (ctx->prealloc_size_y < mask_opt_size) { + ctx->prealloc_size_y = mask_opt_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + + const uint32_t n_head_kv = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q); + vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k); + vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf; + vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf; + vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf; + + uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2; + + if (use_mask_opt) + { + const vk_op_flash_attn_mask_opt_push_constants opt_pc = { + nem0, + nem1, + nem2, + (uint32_t)(mask->nb[1] / sizeof(ggml_fp16_t)), + (uint32_t)(mask->nb[2] / sizeof(ggml_fp16_t)), + (uint32_t)(mask->nb[3] / sizeof(ggml_fp16_t)), + mask_opt_num_dwords, + mask_opt_num_dwords * CEIL_DIV(nem1, Br), + mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2, + }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline_fa_mask_opt, + { mask_buf, mask_opt_buf }, opt_pc, + { mask_opt_num_dwords, CEIL_DIV(nem1, Br), nem2 * nem3 }); + ggml_vk_sync_buffers(ctx, subctx); + } + + const vk_flash_attn_push_constants pc = { N, KV, + (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, + (uint32_t)neq2, (uint32_t)neq3, + (uint32_t)nek2, (uint32_t)nek3, + (uint32_t)nev2, (uint32_t)nev3, + nem1, nem2, nem3, + q_stride, (uint32_t)nbq2, (uint32_t)nbq3, + k_stride, (uint32_t)nbk2, (uint32_t)nbk3, + v_stride, (uint32_t)nbv2, (uint32_t)nbv3, + scale, max_bias, logit_softcap, + mask_n_head_log2, m0, m1, + gqa_ratio, split_kv, split_k }; + + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + + if (ctx->prealloc_split_k_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + // We reuse workgroups_x to mean the number of splits, so we need to + // cancel out the divide by wg_denoms[0]. + uint32_t dispatch_x; + if (gqa_ratio > 1) { + workgroups_x *= pipeline->wg_denoms[0]; + dispatch_x = split_k * workgroups_x; + } else { + dispatch_x = Tr * split_k * pipeline->wg_denoms[0]; + } + + vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf, mask_opt_buf}, + pc, { dispatch_x, workgroups_y, workgroups_z }); + + ggml_vk_sync_buffers(ctx, subctx); + const vk_op_flash_attn_split_k_reduce_push_constants pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, split_k, (sinks != nullptr) }; + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, + {split_k_buf, sinks_buf, dst_buf}, + pc2, { (uint32_t)ne1, HSV, (uint32_t)(ne2 * ne3) }); + ctx->prealloc_split_k_need_sync = true; + } else { + if (gqa_ratio > 1) { + // When using gqa, we want one actual workgroup per batch, so cancel out wg_denoms + workgroups_x *= pipeline->wg_denoms[0]; + } + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf, mask_opt_buf}, + pc, { workgroups_x, workgroups_y, workgroups_z }); + } +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp new file mode 100644 index 0000000000..5111b30435 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp @@ -0,0 +1,1610 @@ +#include "ggml-vulkan-common.h" + +void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) { +#if defined(GGML_VULKAN_RUN_TESTS) + const std::vector vals { + 512, 512, 128, + 128, 512, 512, + 4096, 512, 4096, + 11008, 512, 4096, + 4096, 512, 11008, + 32000, 512, 4096, + 8, 8, 8, + 100, 46, 576, + 623, 111, 128, + 100, 46, 558, + 512, 1, 256, + 128, 110, 622, + 511, 511, 127, + 511, 511, 7, + 511, 511, 17, + 49, 49, 128, + 128, 49, 49, + 4096, 49, 4096, + }; + const size_t num_it = 100; + + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0); + + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0, true); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0, true); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0, true); + + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0); + + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0, true); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0, true); + ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0, true); + + abort(); + + for (size_t i = 0; i < vals.size(); i += 3) { + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); + std::cerr << '\n'; + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2); + std::cerr << '\n'; + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); + std::cerr << '\n' << std::endl; + + if (vals[i + 2] % 32 == 0) { + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0); + std::cerr << '\n' << std::endl; + } + + if (vals[i + 2] % 256 == 0) { + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K); + std::cerr << '\n' << std::endl; + } + } + + GGML_ABORT("fatal error"); +#endif + + if (subctx) { + // Submit and wait for any pending work before reallocating the buffers + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, {}); + ctx->submit_pending = true; + ggml_vk_synchronize(ctx); + GGML_ASSERT(ctx->compute_ctx.expired()); + ggml_vk_ctx_begin(ctx->device, subctx); + ctx->compute_ctx = subctx; + } + + if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")"); + // Resize buffer + if (ctx->prealloc_x != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_x); + } + ctx->prealloc_x = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_x); + } + if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")"); + // Resize buffer + if (ctx->prealloc_y != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_y); + } + ctx->prealloc_y = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_y); + ctx->prealloc_y_last_pipeline_used = nullptr; + ctx->prealloc_y_last_tensor_used = nullptr; + ctx->prealloc_y_last_decode_vector_staging = false; + } + if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")"); + // Resize buffer + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); + } + ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k); + } + if (ctx->prealloc_add_rms_partials == nullptr || (ctx->prealloc_size_add_rms_partials > 0 && ctx->prealloc_add_rms_partials->size < ctx->prealloc_size_add_rms_partials)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(add_partials_size: " << ctx->prealloc_add_rms_partials << ")"); + // Resize buffer + if (ctx->prealloc_add_rms_partials != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials); + } + ctx->prealloc_add_rms_partials = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_add_rms_partials); + } +} + +bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){ + ggml_tensor * node = cgraph->nodes[node_idx]; + if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) { + return false; + } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + return false; + } + + VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); + ctx->semaphore_idx = 0; + + ggml_tensor * src0 = node->src[0]; + ggml_tensor * src1 = node->src[1]; + ggml_tensor * src2 = node->src[2]; + ggml_tensor * src3 = node->src[3]; + + if (node->op == GGML_OP_ADD) { + int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops; + if (next_node_idx < cgraph->n_nodes && + cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM && + cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] && + ggml_nrows(cgraph->nodes[next_node_idx]) == 1 && + ctx->device->add_rms_fusion) { + uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]); + ctx->do_add_rms_partials_offset_calculation = true; + if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) { + ctx->do_add_rms_partials = true; + } + } + } + + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); + + { + // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers + // to synchronize them. This handles most "normal" synchronization when computing the graph, and when + // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers + // outside of this logic. When a node uses one of the prealloc buffers for something like + // dequantization or split_k, additional synchronization is needed between those passes. + bool need_sync = false; + + // Check whether "node" requires synchronization. The node requires synchronization if it + // overlaps in memory with another unsynchronized node and at least one of them is a write. + // Destination nodes are checked against both the written/read lists. Source nodes are only + // checked against the written list. Two nodes overlap in memory if they come from the same + // buffer and the tensor or view ranges overlap. + auto const &overlaps_unsynced = [&](const ggml_tensor *node, const std::vector &unsynced_nodes) -> bool { + if (unsynced_nodes.size() == 0) { + return false; + } + auto n_base = vk_tensor_offset(node) + node->view_offs; + auto n_size = ggml_nbytes(node); + ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)node->buffer->context; + vk_buffer a_buf = a_buf_ctx->dev_buffer; + for (auto &other : unsynced_nodes) { + ggml_backend_vk_buffer_context * o_buf_ctx = (ggml_backend_vk_buffer_context *)other->buffer->context; + vk_buffer o_buf = o_buf_ctx->dev_buffer; + if (a_buf == o_buf) { + auto o_base = vk_tensor_offset(other) + other->view_offs; + auto o_size = ggml_nbytes(other); + + if ((o_base <= n_base && n_base < o_base + o_size) || + (n_base <= o_base && o_base < n_base + n_size)) { + return true; + } + } + } + return false; + }; + + // For all fused ops, check if the destination node or any of the source + // nodes require synchronization. + for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1 && !need_sync; ++i) { + const ggml_tensor *cur_node = cgraph->nodes[node_idx + i]; + // If the node actually writes to memory, then check if it needs to sync + if (ctx->fused_ops_write_mask & (1 << i)) { + if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) { + need_sync = true; + break; + } + } + for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) { + if (!cur_node->src[j]) { + continue; + } + if (overlaps_unsynced(cur_node->src[j], ctx->unsynced_nodes_written)) { + need_sync = true; + break; + } + } + } + + if (need_sync) { + if (vk_enable_sync_logger) { + std::cerr << "sync" << std::endl; + } + ctx->unsynced_nodes_written.clear(); + ctx->unsynced_nodes_read.clear(); + ggml_vk_sync_buffers(ctx, compute_ctx); + + if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { + ctx->query_node_idx[ctx->query_idx] = node_idx; + compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + ggml_vk_sync_buffers(ctx, compute_ctx); + } + } + // Add all fused nodes to the unsynchronized lists. + for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { + const ggml_tensor *cur_node = cgraph->nodes[node_idx + i]; + // Multiple outputs could be written, e.g. in topk_moe. Add them all to the list. + if (ctx->fused_ops_write_mask & (1 << i)) { + ctx->unsynced_nodes_written.push_back(cur_node); + } + for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) { + if (!cur_node->src[j]) { + continue; + } + ctx->unsynced_nodes_read.push_back(cur_node->src[j]); + } + } + } + if (vk_enable_sync_logger) { + for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { + auto *n = cgraph->nodes[node_idx + i]; + std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; + if (n->op == GGML_OP_GLU) { + std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; + } + if (n->op == GGML_OP_ROPE) { + const int mode = ((const int32_t *) n->op_params)[2]; + std::cerr << " rope mode: " << mode; + } + std::cerr << std::endl; + } + } + + switch (node->op) { + case GGML_OP_REPEAT: + ggml_vk_repeat(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_REPEAT_BACK: + ggml_vk_repeat_back(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_ACC: + case GGML_OP_SET: + ggml_vk_acc(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_GET_ROWS: + ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_ADD: + if (ctx->num_additional_fused_ops) { + ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx); + } else { + ggml_vk_add(ctx, compute_ctx, src0, src1, node); + } + break; + case GGML_OP_SUB: + ggml_vk_sub(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_MUL: + if (ctx->num_additional_fused_ops) { + ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx); + } else { + ggml_vk_mul(ctx, compute_ctx, src0, src1, node); + } + + break; + case GGML_OP_DIV: + ggml_vk_div(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_ADD_ID: + ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node); + + break; + case GGML_OP_CONCAT: + ggml_vk_concat(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_UPSCALE: + ggml_vk_upscale(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_ADD1: + ggml_vk_add1(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_ARANGE: + ggml_vk_arange(ctx, compute_ctx, node); + + break; + case GGML_OP_FILL: + ggml_vk_fill(ctx, compute_ctx, node); + + break; + case GGML_OP_SCALE: + ggml_vk_scale(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SQR: + ggml_vk_sqr(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SQRT: + ggml_vk_sqrt(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SIN: + ggml_vk_sin(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_COS: + ggml_vk_cos(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_LOG: + ggml_vk_log(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_TRI: + ggml_vk_tri(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_DIAG: + ggml_vk_diag(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_CLAMP: + ggml_vk_clamp(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_PAD: + ggml_vk_pad(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_ROLL: + ggml_vk_roll(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_CPY: + case GGML_OP_CONT: + case GGML_OP_DUP: + ggml_vk_cpy(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SET_ROWS: + ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_SILU_BACK: + ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_NORM: + ggml_vk_norm(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_GROUP_NORM: + ggml_vk_group_norm(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_RMS_NORM: + ggml_vk_rms_norm(ctx, compute_ctx, cgraph, node_idx, (float *)node->op_params); + break; + case GGML_OP_RMS_NORM_BACK: + ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_L2_NORM: + ggml_vk_l2_norm(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_UNARY: + if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { + ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); + break; + } + + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SOFTPLUS: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_ROUND: + case GGML_UNARY_OP_CEIL: + case GGML_UNARY_OP_FLOOR: + case GGML_UNARY_OP_TRUNC: + case GGML_UNARY_OP_SGN: + ggml_vk_unary(ctx, compute_ctx, src0, node); + break; + case GGML_UNARY_OP_XIELU: + ggml_vk_xielu(ctx, compute_ctx, src0, node); + break; + default: + return false; + } + break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_SWIGLU_OAI: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: + ggml_vk_glu(ctx, compute_ctx, src0, src1, node); + break; + default: + return false; + } + break; + case GGML_OP_DIAG_MASK_INF: + ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SOFT_MAX: + if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { + ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); + } else { + ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node); + } + + break; + case GGML_OP_SOFT_MAX_BACK: + ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_ROPE: + ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false); + + break; + case GGML_OP_ROPE_BACK: + ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true); + + break; + case GGML_OP_ARGSORT: + if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { + ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); + } else { + ggml_vk_argsort(ctx, compute_ctx, src0, node); + } + + break; + case GGML_OP_TOP_K: + ggml_vk_topk(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SUM: + ggml_vk_sum(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_SUM_ROWS: + ggml_vk_sum_rows(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_CUMSUM: + ggml_vk_cumsum(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_MEAN: + ggml_vk_mean(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_ARGMAX: + ggml_vk_argmax(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_COUNT_EQUAL: + ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_SOLVE_TRI: + ggml_vk_solve_tri(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_IM2COL: + ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_IM2COL_3D: + ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_COL2IM_1D: + ggml_vk_col2im_1d(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_CONV_TRANSPOSE_1D: + ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_POOL_2D: + ggml_vk_pool_2d(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_CONV_2D: + case GGML_OP_CONV_TRANSPOSE_2D: + ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_CONV_2D_DW: + ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_LEAKY_RELU: + ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_MUL_MAT: + ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx); + + break; + case GGML_OP_MUL_MAT_ID: + ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx); + + break; + + case GGML_OP_FLASH_ATTN_EXT: + ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node); + + break; + + case GGML_OP_RWKV_WKV6: + ggml_vk_rwkv_wkv6(ctx, compute_ctx, node); + + break; + + case GGML_OP_RWKV_WKV7: + ggml_vk_rwkv_wkv7(ctx, compute_ctx, node); + + break; + + case GGML_OP_GATED_DELTA_NET: + ggml_vk_gated_delta_net(ctx, compute_ctx, node); + + break; + + case GGML_OP_SSM_SCAN: + ggml_vk_ssm_scan(ctx, compute_ctx, node); + + break; + + case GGML_OP_SSM_CONV: + ggml_vk_ssm_conv(ctx, compute_ctx, cgraph, node_idx); + + break; + + case GGML_OP_OPT_STEP_ADAMW: + ggml_vk_opt_step_adamw(ctx, compute_ctx, node); + + break; + + case GGML_OP_OPT_STEP_SGD: + ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node); + + break; + default: + return false; + } + + ctx->tensor_ctxs[node_idx] = compute_ctx; + +#if defined(GGML_VULKAN_CHECK_RESULTS) + // Force context reset on each node so that each tensor ends up in its own context + // and can be run and compared to its CPU equivalent separately + last_node = true; +#endif + + if (submit || last_node) { + ggml_vk_ctx_end(compute_ctx); + + // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward + if (last_node) { + compute_ctx->exit_tensor_idx = node_idx_begin; + } + else { + compute_ctx->exit_tensor_idx = -1; + } + + ctx->compute_ctx.reset(); + + ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready); + } + return true; +} + +void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready) { + GGML_UNUSED(cgraph); + GGML_UNUSED(tensor); + + VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); + + vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); + + // Only run if ctx hasn't been submitted yet + if (!subctx->seqs.empty()) { +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_0(ctx, cgraph, tensor_idx); +#endif + + // Do staging buffer copies + for (auto& cpy : subctx->in_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + + for (auto& mset : subctx->memsets) { + memset(mset.dst, mset.val, mset.n); + } + + if (almost_ready && !ctx->almost_ready_fence_pending) { + ggml_vk_submit(subctx, ctx->almost_ready_fence); + ctx->almost_ready_fence_pending = true; + } else { + ggml_vk_submit(subctx, {}); + } + ctx->submit_pending = true; + +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_synchronize(ctx); + ggml_vk_check_results_1(ctx, cgraph, tensor_idx); +#endif + } + + if (tensor_idx == subctx->exit_tensor_idx) { + // Do staging buffer copies + for (auto& cpy : subctx->out_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + subctx->in_memcpys.clear(); + subctx->out_memcpys.clear(); + subctx->memsets.clear(); + } +} + +void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { + VK_LOG_DEBUG("ggml_vk_graph_cleanup()"); + ctx->prealloc_y_last_pipeline_used = {}; + ctx->prealloc_y_last_tensor_used = nullptr; + ctx->prealloc_y_last_decode_vector_staging = false; + + ctx->unsynced_nodes_written.clear(); + ctx->unsynced_nodes_read.clear(); + ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; + + ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + if (ctx->device->async_use_transfer_queue) { + ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); + } + + for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { + ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); + } + ctx->gc.semaphores.clear(); + + for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { + ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); + } + ctx->gc.tl_semaphores.clear(); + ctx->semaphore_idx = 0; + + ctx->event_idx = 0; + + for (auto& event : ctx->gc.events) { + ctx->device->device.resetEvent(event); + } + + ctx->tensor_ctxs.clear(); + ctx->gc.contexts.clear(); + ctx->pipeline_descriptor_set_requirements = 0; + ctx->descriptor_set_idx = 0; +} + +void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { + VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")"); + // discard any unsubmitted command buffers + ctx->compute_ctx.reset(); + // wait for any pending command buffers to finish + ggml_vk_synchronize(ctx); + + ggml_vk_graph_cleanup(ctx); + + ggml_vk_destroy_buffer(ctx->prealloc_x); + ggml_vk_destroy_buffer(ctx->prealloc_y); + ggml_vk_destroy_buffer(ctx->prealloc_split_k); + ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials); + ggml_vk_destroy_buffer(ctx->sync_staging); + + ctx->prealloc_y_last_pipeline_used = nullptr; + ctx->prealloc_y_last_tensor_used = nullptr; + ctx->prealloc_y_last_decode_vector_staging = false; + + ctx->prealloc_size_x = 0; + ctx->prealloc_size_y = 0; + ctx->prealloc_size_split_k = 0; + + for (auto& event : ctx->gc.events) { + ctx->device->device.destroyEvent(event); + } + ctx->gc.events.clear(); + + ctx->device->device.destroyFence(ctx->fence); + ctx->device->device.destroyFence(ctx->almost_ready_fence); + + for (auto& pool : ctx->descriptor_pools) { + ctx->device->device.destroyDescriptorPool(pool); + } + ctx->descriptor_pools.clear(); + ctx->descriptor_sets.clear(); + + ctx->compute_cmd_pool.destroy(ctx->device->device); + if (ctx->device->async_use_transfer_queue) { + ctx->device->device.destroySemaphore(ctx->transfer_semaphore.s); + + ctx->transfer_cmd_pool.destroy(ctx->device->device); + } + if (vk_perf_logger_enabled) { + ctx->perf_logger->print_timings(true); + } +} + +void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { + VK_LOG_DEBUG("ggml_vk_synchronize()"); + + bool do_transfer = !ctx->compute_ctx.expired(); + + if (ggml_vk_submit_transfer_ctx(ctx)) { + ctx->submit_pending = true; + } + + vk_context compute_ctx; + vk_command_buffer* cmd_buf = nullptr; + if (do_transfer) { + compute_ctx = ctx->compute_ctx.lock(); + if (compute_ctx->s) { + cmd_buf = compute_ctx->s->buffer; + } + + ggml_vk_ctx_end(compute_ctx); + + for (auto& cpy : compute_ctx->in_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + + ggml_vk_submit(compute_ctx, {}); + ctx->submit_pending = true; + } + + if (ctx->submit_pending) { + if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { + vk::TimelineSemaphoreSubmitInfo tl_info{ + 1, &ctx->transfer_semaphore.value, + 0, nullptr, + }; + vk::PipelineStageFlags stage = ctx->device->transfer_queue.stage_flags; + vk::SubmitInfo si{ + 1, &ctx->transfer_semaphore.s, &stage, + 0, nullptr, + 0, nullptr, + }; + si.setPNext(&tl_info); + std::lock_guard guard(queue_mutex); + ctx->device->compute_queue.queue.submit({ si }, ctx->fence); + ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; + } else { + std::lock_guard guard(queue_mutex); + ctx->device->compute_queue.queue.submit({}, ctx->fence); + } + ggml_vk_wait_for_fence(ctx); + ctx->submit_pending = false; + if (cmd_buf) { + cmd_buf->in_use = false; + cmd_buf->buf.reset(); + } + } + + if (do_transfer) { + for (auto& cpy : compute_ctx->out_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + ctx->compute_ctx.reset(); + } +} + +bool ggml_vk_is_empty(ggml_tensor * node) { + return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; +} + +bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } + + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) { + // additional constraints specific to this fusion + const ggml_tensor *rms_norm = cgraph->nodes[node_idx]; + const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + + GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); + // rms_norm only supports f32 + if (mul->src[0]->type != GGML_TYPE_F32 || + mul->src[1]->type != GGML_TYPE_F32 || + mul->type != GGML_TYPE_F32) { + return false; + } + // if rms_norm is the B operand, then we don't handle broadcast + if (rms_norm == mul->src[1] && + !ggml_are_same_shape(mul->src[0], rms_norm)) { + return false; + } + // rms_norm shader assumes contiguous rows + if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { + return false; + } + } + auto const &mm_add_ok = [&](const ggml_tensor *mul, const ggml_tensor *add) { + const ggml_tensor *bias = add->src[0] == mul ? add->src[1] : add->src[0]; + + // mat-vec only + if (ggml_nrows(mul) != 1) { + return false; + } + // shaders assume the types match + if (mul->type != bias->type) { + return false; + } + // shaders reuse the D shape for bias + if (!ggml_are_same_shape(mul, bias) || + !ggml_are_same_stride(mul, bias)) { + return false; + } + // unaligned bias isn't handled + if (get_misalign_bytes(ctx, bias) != 0) { + return false; + } + return true; + }; + + if ((ops.size() == 2 || ops.size() == 3) && ops.begin()[0] == GGML_OP_MUL_MAT && ops.begin()[1] == GGML_OP_ADD) { + // additional constraints specific to this fusion + const ggml_tensor *mul = cgraph->nodes[node_idx]; + const ggml_tensor *add = cgraph->nodes[node_idx + 1]; + + if (!mm_add_ok(mul, add)) { + return false; + } + if (ops.size() == 3) { + if (ops.begin()[2] != GGML_OP_ADD) { + return false; + } + if (!mm_add_ok(add, cgraph->nodes[node_idx + 2])) { + return false; + } + } + } + + auto const &mmid_mul_ok = [&](const ggml_tensor *mmid, const ggml_tensor *mul) { + const ggml_tensor *scale = mul->src[1]; + + if (mmid != mul->src[0]) { + return false; + } + // mat-vec only + if (!ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { + return false; + } + // shaders assume the types match + if (mmid->type != scale->type) { + return false; + } + // shaders assume the bias is contiguous + if (!ggml_is_contiguous(scale)) { + return false; + } + // unaligned bias isn't handled + if (get_misalign_bytes(ctx, scale) != 0) { + return false; + } + // shader only indexes by expert index + if (scale->ne[0] != 1 || + scale->ne[1] != mul->ne[1] || + scale->ne[2] != 1 || + scale->ne[3] != 1) { + return false; + } + return true; + }; + + if ((ops.size() == 2 || ops.size() == 3) && ops.begin()[0] == GGML_OP_MUL_MAT_ID && ops.begin()[1] == GGML_OP_ADD_ID) { + // additional constraints specific to this fusion + const ggml_tensor *mul = cgraph->nodes[node_idx]; + const ggml_tensor *add = cgraph->nodes[node_idx + 1]; + const ggml_tensor *bias = add->src[1]; + + if (mul != add->src[0]) { + return false; + } + // mat-vec only + if (!ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { + return false; + } + // shaders assume the types match + if (mul->type != bias->type) { + return false; + } + // shaders assume the bias is contiguous + if (!ggml_is_contiguous(bias)) { + return false; + } + // the ID tensor must be the same for mul_mat_id and add_id + if (mul->src[2] != add->src[2]) { + return false; + } + // unaligned bias isn't handled + if (get_misalign_bytes(ctx, bias) != 0) { + return false; + } + + if (ops.size() == 3) { + if (ops.begin()[2] != GGML_OP_MUL) { + return false; + } + const ggml_tensor *mul = cgraph->nodes[node_idx + 2]; + return mmid_mul_ok(add, mul); + } + } + + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_MUL_MAT_ID && ops.begin()[1] == GGML_OP_MUL) { + // additional constraints specific to this fusion + const ggml_tensor *mmid = cgraph->nodes[node_idx]; + const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + + if (!mmid_mul_ok(mmid, mul)) { + return false; + } + } + + return true; +} + +bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, + int node_idx, int num_extra) { + const ggml_tensor * conv = cgraph->nodes[node_idx]; + if (conv->op != GGML_OP_SSM_CONV) { + return false; + } + + const ggml_tensor * silu = nullptr; + const ggml_tensor * bias = nullptr; + + if (num_extra == 1) { + if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_UNARY })) { + return false; + } + silu = cgraph->nodes[node_idx + 1]; + } else if (num_extra == 2) { + if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY })) { + return false; + } + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + silu = cgraph->nodes[node_idx + 2]; + bias = (add->src[0] == conv) ? add->src[1] : add->src[0]; + + if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) { + return false; + } + // bias must be channel-wise (one element per channel of the conv output) + if (ggml_nelements(bias) != conv->ne[0] || bias->ne[0] != conv->ne[0]) { + return false; + } + if (add->type != GGML_TYPE_F32) { + return false; + } + // The shader doesn't apply per-tensor offsets, so reject misaligned bias. + if (get_misalign_bytes(ctx, bias) != 0) { + return false; + } + } else { + return false; + } + + if (ggml_get_unary_op(silu) != GGML_UNARY_OP_SILU) { + return false; + } + if (conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) { + return false; + } + // The shader writes to the fused dst using its own strides, but the push constants don't + // carry a per-tensor offset, so the binding must be naturally aligned. + if (get_misalign_bytes(ctx, silu) != 0) { + return false; + } + return true; +} + +bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, + int node_idx, topk_moe_mode mode) { + + const ggml_tensor * softmax; + const ggml_tensor * weights; + const ggml_tensor * get_rows; + const ggml_tensor * argsort; + + switch (mode) { + case TOPK_MOE_EARLY_SOFTMAX_NORM: + softmax = cgraph->nodes[node_idx + 0]; + weights = cgraph->nodes[node_idx + 9]; + get_rows = cgraph->nodes[node_idx + 4]; + argsort = cgraph->nodes[node_idx + 2]; + break; + case TOPK_MOE_SIGMOID_NORM_BIAS: + softmax = cgraph->nodes[node_idx + 0]; // really sigmoid + weights = cgraph->nodes[node_idx + 10]; + get_rows = cgraph->nodes[node_idx + 5]; + argsort = cgraph->nodes[node_idx + 3]; + if (ggml_get_unary_op(softmax) != GGML_UNARY_OP_SIGMOID) { + return false; + } + // bias is expected to be 1D + if (ggml_nrows(cgraph->nodes[node_idx + 2]->src[1]) != 1 || + !ggml_is_contiguous(cgraph->nodes[node_idx + 2]->src[1])) { + return false; + } + // sigmoid fusion seems to generate infinities on moltenvk + if (ctx->device->driver_id == vk::DriverId::eMoltenvk) { + return false; + } + break; + case TOPK_MOE_EARLY_SOFTMAX: + softmax = cgraph->nodes[node_idx + 0]; + weights = cgraph->nodes[node_idx + 4]; + get_rows = cgraph->nodes[node_idx + 4]; + argsort = cgraph->nodes[node_idx + 2]; + break; + case TOPK_MOE_LATE_SOFTMAX: + softmax = cgraph->nodes[node_idx + 4]; + weights = cgraph->nodes[node_idx + 5]; + get_rows = cgraph->nodes[node_idx + 2]; + argsort = cgraph->nodes[node_idx + 0]; + break; + default: + return false; + } + + ggml_tensor * probs = get_rows->src[0]; + if (probs->op != GGML_OP_RESHAPE) { + return false; + } + probs = probs->src[0]; + ggml_tensor * selection_probs = argsort->src[0]; + + if (probs != selection_probs && mode != TOPK_MOE_SIGMOID_NORM_BIAS) { + return false; + } + + if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) { + return false; + } + + if (softmax->op == GGML_OP_SOFT_MAX) { + const float * op_params = (const float *)softmax->op_params; + + float scale = op_params[0]; + float max_bias = op_params[1]; + + if (scale != 1.0f || max_bias != 0.0f) { + return false; + } + + // don't fuse when masks or sinks are present + if (softmax->src[1] || softmax->src[2]) { + return false; + } + } + + const int n_expert = softmax->ne[0]; + if (n_expert > (1 << (num_topk_moe_pipelines-1))) { + return false; + } + + if (!ctx->device->subgroup_arithmetic || + !ctx->device->subgroup_shuffle || + !ctx->device->subgroup_require_full_support || + ctx->device->disable_fusion) { + return false; + } + + return true; +} + +bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, + int node_idx) { + GGML_UNUSED(ctx); + const ggml_tensor *rope = cgraph->nodes[node_idx + 0]; + const ggml_tensor *view = cgraph->nodes[node_idx + 1]; + const ggml_tensor *set_rows = cgraph->nodes[node_idx + 2]; + + // ne3 not tested + if (rope->src[0]->ne[3] != 1) { + return false; + } + + if (set_rows->type != GGML_TYPE_F32 && set_rows->type != GGML_TYPE_F16) { + return false; + } + + if (set_rows->src[1]->type != GGML_TYPE_I64) { + return false; + } + + // The view should flatten two dims of rope into one dim + if (!ggml_is_contiguous(view) || + view->ne[0] != rope->ne[0] * rope->ne[1]) { + return false; + } + + // Only norm/neox/mrope shaders have the fusion code + const int mode = ((const int32_t *) rope->op_params)[2]; + if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_MROPE) { + return false; + } + + return true; +} + +bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { + GGML_UNUSED(ctx); + if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) { + return false; + } + + const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; + const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1]; + const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; + const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; + const ggml_tensor * add = cgraph->nodes[node_idx + 4]; + + const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; + const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; + + const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; + const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0]; + + if (x_in_add != x) { + return false; + } + if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) { + return false; + } + // Shader bindings: data_a is A_TYPE so it follows x's precision, while + // data_b and data_c are hardcoded float, so the broadcast operands must + // be F32 regardless of x's type. + if (a->type != GGML_TYPE_F32) return false; + if (inv_b->type != GGML_TYPE_F32) return false; + // Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline). + if (mul0->type != x->type) return false; + if (sin_node->type != x->type) return false; + if (sqr->type != x->type) return false; + if (mul1->type != x->type) return false; + if (add->type != x->type) return false; + if (!ggml_are_same_shape(a, inv_b)) { + return false; + } + if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) { + return false; + } + // Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b + // must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader. + if (x->ne[2] != 1 || x->ne[3] != 1) return false; + if (add->ne[2] != 1 || add->ne[3] != 1) return false; + if (a->ne[2] != 1 || a->ne[3] != 1) return false; + if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false; + // Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1], + // so every operand must be contiguous. + if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) || + !ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) { + return false; + } + return true; +} + +bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise) { + ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)a->buffer->context; + vk_buffer a_buf = a_buf_ctx->dev_buffer; + ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context; + vk_buffer b_buf = b_buf_ctx->dev_buffer; + if (a_buf == b_buf) { + auto a_base = vk_tensor_offset(a) + a->view_offs; + auto a_size = ggml_nbytes(a); + auto b_base = vk_tensor_offset(b) + b->view_offs; + auto b_size = ggml_nbytes(b); + + if (elementwise && a_base == b_base && a_size == b_size) { + return false; + } + + if ((b_base <= a_base && a_base < b_base + b_size) || + (a_base <= b_base && b_base < a_base + a_size)) { + return true; + } + } + return false; +} + +bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, + int node_idx) { + GGML_UNUSED(ctx); + const ggml_tensor *rms = cgraph->nodes[node_idx + 0]; + const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + const ggml_tensor *rope = cgraph->nodes[node_idx + 2]; + + const int mode = ((const int32_t *) rope->op_params)[2]; + + // noncontig tensors aren't tested, and don't seem common in practice + if (!ggml_is_contiguous(rms) || + !ggml_is_contiguous(mul) || + !ggml_is_contiguous(rope)) { + return false; + } + + // only norm/neox are handled in the shader + if (mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_NORMAL) { + return false; + } + + // shared memory size for passing data from mul->rope + if (mul->ne[0] > 1024) { + return false; + } + + // conditions for pipeline creation + if (sizeof(vk_op_rms_norm_mul_rope_push_constants) > ctx->device->properties.limits.maxPushConstantsSize) { + return false; + } + + return true; +} + +uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { + + const ggml_tensor *first_node = cgraph->nodes[node_idx]; + if (first_node->op != GGML_OP_ADD) { + return 0; + } + + if (!ctx->device->multi_add) { + return 0; + } + + int32_t num_adds = 1; + while (node_idx + num_adds < cgraph->n_nodes && + cgraph->nodes[node_idx + num_adds]->op == GGML_OP_ADD && + num_adds < MAX_FUSED_ADDS) { + num_adds++; + } + + // The shader currently requires same shapes (but different strides are allowed), + // everything f32, and no misalignment + for (int32_t i = 0; i < num_adds; ++i) { + const ggml_tensor *next_node = cgraph->nodes[node_idx + i]; + if (!ggml_are_same_shape(first_node, next_node->src[0]) || + !ggml_are_same_shape(first_node, next_node->src[1]) || + next_node->type != GGML_TYPE_F32 || + next_node->src[0]->type != GGML_TYPE_F32 || + next_node->src[1]->type != GGML_TYPE_F32 || + get_misalign_bytes(ctx, next_node) || + get_misalign_bytes(ctx, next_node->src[0]) || + get_misalign_bytes(ctx, next_node->src[1])) { + num_adds = i; + } + } + + // Verify we can fuse these + ggml_op adds[MAX_FUSED_ADDS]; + for (int32_t i = 0; i < num_adds; ++i) { + adds[i] = GGML_OP_ADD; + } + + // decrease num_adds if they can't all be fused + while (num_adds > 1 && !ggml_can_fuse(cgraph, node_idx, adds, num_adds)) { + num_adds--; + } + + // a single add is not "fused", so just return zero + if (num_adds == 1) { + return 0; + } + return num_adds; +} + +void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph) +{ + VK_LOG_DEBUG("ggml_vk_graph_optimize(" << graph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + if (ctx->device->disable_graph_optimize) { + return; + } + + auto const &is_empty = [](ggml_tensor * node) -> bool { + return node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; + }; + + auto const &is_src_of = [](const ggml_tensor *dst, const ggml_tensor *src) -> bool { + for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { + if (dst->src[s] == src) { + return true; + } + } + // implicit dependency if they view the same tensor + const ggml_tensor *dst2 = dst->view_src ? dst->view_src : dst; + const ggml_tensor *src2 = src->view_src ? src->view_src : src; + if (dst2 == src2) { + return true; + } + return false; + }; + + std::vector new_order; + std::vector used(graph->n_nodes, false); + std::set used_node_set; + + int first_unused = 0; + while (first_unused < graph->n_nodes) { + std::vector current_set; + + // Check for fusion patterns and avoid reordering them + auto const &match_pattern = [&](const std::initializer_list &pattern, int start) -> bool { + if (start + (int)pattern.size() <= graph->n_nodes) { + bool is_pattern = true; + for (size_t j = 0; j < pattern.size(); ++j) { + if (graph->nodes[start + j]->op != pattern.begin()[j] || used[start + j]) { + is_pattern = false; + } + } + return is_pattern; + } + return false; + }; + + auto const &keep_pattern = [&](const std::initializer_list &pattern) -> bool { + if (match_pattern(pattern, first_unused)) { + for (size_t j = 0; j < pattern.size(); ++j) { + new_order.push_back(graph->nodes[first_unused + j]); + used_node_set.insert(graph->nodes[first_unused + j]); + used[first_unused + j] = true; + } + while (first_unused < graph->n_nodes && used[first_unused]) { + first_unused++; + } + return true; + } + return false; + }; + + if (keep_pattern(topk_moe_early_softmax_norm)) { + continue; + } + if (keep_pattern(topk_moe_sigmoid_norm_bias)) { + continue; + } + if (keep_pattern(topk_moe_early_softmax)) { + continue; + } + if (keep_pattern(topk_moe_late_softmax)) { + continue; + } + if (keep_pattern(snake_pattern)) { + continue; + } + + // First, grab the next unused node. + current_set.push_back(first_unused); + + // Loop through the next N nodes. Grab any that don't depend on other nodes that + // haven't already been run. Nodes that have already been run have used[i] set + // to true. Allow nodes that depend on the previous node if it's a fusion pattern + // that we support (e.g. RMS_NORM + MUL). + // This first pass only grabs "real" (non-view nodes). Second pass grabs view nodes. + // The goal is to not interleave real and view nodes in a way that breaks fusion. + const int NUM_TO_CHECK = 20; + for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { + if (used[j]) { + continue; + } + if (is_empty(graph->nodes[j])) { + continue; + } + // Don't pull forward nodes from fusion patterns + if (match_pattern(topk_moe_early_softmax_norm, j) || + match_pattern(topk_moe_sigmoid_norm_bias, j) || + match_pattern(topk_moe_early_softmax, j) || + match_pattern(topk_moe_late_softmax, j) || + match_pattern(snake_pattern, j)) { + continue; + } + bool ok = true; + for (int c = first_unused; c < j; ++c) { + if (!used[c] && + is_src_of(graph->nodes[j], graph->nodes[c]) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT && graph->nodes[j]->op == GGML_OP_ADD) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_MUL) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_ADD && graph->nodes[j]->op == GGML_OP_ADD) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_ADD) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_UNARY)) { + ok = false; + break; + } + } + if (ok) { + current_set.push_back(j); + + int rope_idx = j; + + // When we've found RMS_NORM + MUL, try to find a ROPE that uses it + if (j > 0 && + graph->nodes[j]->op == GGML_OP_MUL && + graph->nodes[j-1]->op == GGML_OP_RMS_NORM) { + for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { + if (graph->nodes[k]->op == GGML_OP_ROPE && + graph->nodes[k]->src[0] == graph->nodes[j] && + // Check that other srcs are already valid + graph->nodes[k]->src[1]->op == GGML_OP_NONE && + (graph->nodes[k]->src[2] == nullptr || graph->nodes[k]->src[2]->op == GGML_OP_NONE)) { + rope_idx = k; + current_set.push_back(rope_idx); + used[rope_idx] = true; + break; + } + } + } + // Look for ROPE + VIEW + SET_ROWS and make them consecutive + if (graph->nodes[rope_idx]->op == GGML_OP_ROPE) { + int view_idx = -1; + int set_rows_idx = -1; + for (int k = rope_idx+1; k < std::min(rope_idx + 10, graph->n_nodes); ++k) { + if (view_idx == -1 && + graph->nodes[k]->op == GGML_OP_VIEW && + graph->nodes[k]->src[0] == graph->nodes[rope_idx]) { + view_idx = k; + continue; + } + if (view_idx != -1 && + set_rows_idx == -1 && + graph->nodes[k]->op == GGML_OP_SET_ROWS && + graph->nodes[k]->src[0] == graph->nodes[view_idx]) { + set_rows_idx = k; + break; + } + } + if (set_rows_idx != -1) { + current_set.push_back(view_idx); + current_set.push_back(set_rows_idx); + used[view_idx] = true; + used[set_rows_idx] = true; + } + } + // Look for MUL_MAT_ID + ADD_ID + MUL + if (j > 0 && + graph->nodes[j]->op == GGML_OP_ADD_ID && + graph->nodes[j-1]->op == GGML_OP_MUL_MAT_ID) { + for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { + if (graph->nodes[k]->op == GGML_OP_MUL && + graph->nodes[k]->src[0] == graph->nodes[j] && + // src1 must either be weights or already processed + (graph->nodes[k]->src[1]->op == GGML_OP_NONE || used_node_set.find(graph->nodes[k]->src[1]) != used_node_set.end())) { + current_set.push_back(k); + used[k] = true; + break; + } + } + } + // Look for MUL_MAT + ADD + ADD + if (j > 0 && + graph->nodes[j]->op == GGML_OP_ADD && + graph->nodes[j-1]->op == GGML_OP_MUL_MAT) { + for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { + if (graph->nodes[k]->op == GGML_OP_ADD && + graph->nodes[k]->src[0] == graph->nodes[j] && + // src1 must either be weights or already processed + (graph->nodes[k]->src[1]->op == GGML_OP_NONE || used_node_set.find(graph->nodes[k]->src[1]) != used_node_set.end())) { + current_set.push_back(k); + used[k] = true; + break; + } + } + } + // SSM_CONV + ADD + UNARY: pull the consuming UNARY forward + if (j > 0 && + graph->nodes[j]->op == GGML_OP_ADD && + graph->nodes[j-1]->op == GGML_OP_SSM_CONV) { + for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { + if (graph->nodes[k]->op == GGML_OP_UNARY && + graph->nodes[k]->src[0] == graph->nodes[j]) { + current_set.push_back(k); + used[k] = true; + break; + } + } + } + } + } + // Second pass grabs view nodes. + // Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add). + if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) { + for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { + if (used[j]) { + continue; + } + if (!is_empty(graph->nodes[j])) { + continue; + } + bool ok = true; + for (int c = first_unused; c < j; ++c) { + bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end(); + // skip views whose srcs haven't been processed. + if (!used[c] && + is_src_of(graph->nodes[j], graph->nodes[c]) && + !c_in_current_set) { + ok = false; + break; + } + } + if (ok) { + current_set.push_back(j); + } + } + } + + // Push the current set into new_order + for (auto c : current_set) { + new_order.push_back(graph->nodes[c]); + used_node_set.insert(graph->nodes[c]); + used[c] = true; + } + while (first_unused < graph->n_nodes && used[first_unused]) { + first_unused++; + } + } + // Replace the graph with the new order. + for (int i = 0; i < graph->n_nodes; ++i) { + graph->nodes[i] = new_order[i]; + } +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp new file mode 100644 index 0000000000..952057ce1c --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp @@ -0,0 +1,1618 @@ +#include "ggml-vulkan-common.h" + +static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { + vk::PhysicalDeviceProperties props = device.getProperties(); + + if (props.vendorID == VK_VENDOR_ID_AMD) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool amd_shader_core_properties = false; + bool integer_dot_product = false; + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) { + amd_shader_core_properties = true; + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { + integer_dot_product = true; + } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &shader_core_props_amd; + shader_core_props_amd.pNext = &integer_dot_props; + integer_dot_props.pNext = &subgroup_size_control_props; + + device.getProperties2(&props2); + + if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) { + return vk_device_architecture::AMD_GCN; + } + if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) { + // RDNA + if (shader_core_props_amd.wavefrontsPerSimd == 20) { + return vk_device_architecture::AMD_RDNA1; + } + if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) { + return vk_device_architecture::AMD_RDNA3; + } + return vk_device_architecture::AMD_RDNA2; + } + } else if (props.vendorID == VK_VENDOR_ID_INTEL) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &subgroup_size_control_props; + device.getProperties2(&props2); + + if (subgroup_size_control_props.minSubgroupSize == 16) { + // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8. + // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value. + // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html + // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html + return vk_device_architecture::INTEL_XE2; + } + } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool cooperative_matrix = false; + bool sm_builtins = false; + + // Detect "pre-turing" based on lack of coopmat support. + for (const auto& properties : ext_props) { + if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { + cooperative_matrix = true; + } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { + sm_builtins = true; + } + } + + if (!cooperative_matrix) { + return vk_device_architecture::NVIDIA_PRE_TURING; + } + + if (sm_builtins) { + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; + + props2.pNext = &sm_props; + + device.getProperties2(&props2); + + // Turing has 32, following architectures have 48 + if (sm_props.shaderWarpsPerSM == 32) { + return vk_device_architecture::NVIDIA_TURING; + } + } + } + return vk_device_architecture::OTHER; +} + +static bool vk_instance_initialized = false; + +vk_instance_t vk_instance; + +vk_device ggml_vk_get_device(size_t idx) { + VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")"); + + if (vk_instance.devices[idx] == nullptr) { + VK_LOG_DEBUG("Initializing new vk_device"); + vk_device device = std::make_shared(); + vk_instance.devices[idx] = device; + + device->memory_logger = std::unique_ptr(new vk_memory_logger()); + + size_t dev_num = vk_instance.device_indices[idx]; + + std::vector physical_devices = vk_instance.instance.enumeratePhysicalDevices(); + + if (dev_num >= physical_devices.size()) { + std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; + throw std::runtime_error("Device not found"); + } + + device->physical_device = physical_devices[dev_num]; + const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); + + device->architecture = get_device_architecture(device->physical_device); + + const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY"); + device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr; + + const char* GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM = getenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM"); + device->disable_host_visible_vidmem = GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM != nullptr; + + const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK"); + device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr; + + const char* GGML_VK_DISABLE_GRAPH_OPTIMIZE = getenv("GGML_VK_DISABLE_GRAPH_OPTIMIZE"); + device->disable_graph_optimize = GGML_VK_DISABLE_GRAPH_OPTIMIZE != nullptr; + + bool fp16_storage = false; + bool fp16_compute = false; + bool maintenance4_support = false; + bool sm_builtins = false; + bool amd_shader_core_properties2 = false; + bool pipeline_robustness = false; + bool coopmat2_support = false; + bool coopmat2_decode_vector_support = false; + bool pipeline_executable_properties_support = false; + device->coopmat_support = false; + device->integer_dot_product = false; + device->shader_64b_indexing = false; + bool bfloat16_support = false; + bool dot2_f16_support = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { + maintenance4_support = true; + } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; + } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { + sm_builtins = true; + } else if (strcmp("VK_AMD_shader_core_properties2", properties.extensionName) == 0) { + amd_shader_core_properties2 = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; + } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + device->subgroup_size_control = true; +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT")) { + device->coopmat_support = true; + device->coopmat_m = 0; + device->coopmat_n = 0; + device->coopmat_k = 0; +#endif +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2")) { + coopmat2_support = true; +#endif + } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) { + coopmat2_decode_vector_support = true; +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { + device->integer_dot_product = true; +#endif +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_BFLOAT16")) { + bfloat16_support = true; +#endif + } else if (strcmp("VK_VALVE_shader_mixed_float_dot_product", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_DOT2")) { + dot2_f16_support = true; + } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) { + pipeline_executable_properties_support = true; + } else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 && + getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) { + device->memory_priority = true; + } else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) { + device->external_memory_host = true; +#if defined(VK_EXT_shader_64bit_indexing) + } else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) { + device->shader_64b_indexing = true; +#endif + } + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceMaintenance3Properties props3; + vk::PhysicalDeviceMaintenance4Properties props4; + vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceDriverProperties driver_props; + vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; + vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; + vk::PhysicalDeviceVulkan11Properties vk11_props; + vk::PhysicalDeviceVulkan12Properties vk12_props; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props; + vk::PhysicalDeviceExternalMemoryHostPropertiesEXT external_memory_host_props; + + props2.pNext = &props3; + props3.pNext = &subgroup_props; + subgroup_props.pNext = &driver_props; + driver_props.pNext = &vk11_props; + vk11_props.pNext = &vk12_props; + + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props; + + if (maintenance4_support) { + last_struct->pNext = (VkBaseOutStructure *)&props4; + last_struct = (VkBaseOutStructure *)&props4; + } + if (sm_builtins) { + last_struct->pNext = (VkBaseOutStructure *)&sm_props; + last_struct = (VkBaseOutStructure *)&sm_props; + } + if (amd_shader_core_properties2) { + last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props; + last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props; + } + if (device->subgroup_size_control) { + last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props; + last_struct = (VkBaseOutStructure *)&subgroup_size_control_props; + } + +#if defined(VK_NV_cooperative_matrix2) + vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props; + last_struct = (VkBaseOutStructure *)&coopmat2_props; + } +#endif + + if (device->integer_dot_product) { + last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props; + last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props; + } + + if (device->external_memory_host) { + last_struct->pNext = (VkBaseOutStructure *)&external_memory_host_props; + last_struct = (VkBaseOutStructure *)&external_memory_host_props; + } + + device->physical_device.getProperties2(&props2); + device->properties = props2.properties; + device->vendor_id = device->properties.vendorID; + device->driver_id = driver_props.driverID; + + if (device->driver_id == vk::DriverId::eMoltenvk) { + // Disable external_memory_host until https://github.com/KhronosGroup/MoltenVK/pull/2622 + // is available in the Vulkan SDK. + device->external_memory_host = false; + } + + // Implementing the async backend interfaces seems broken on older Intel HW, + // see https://github.com/ggml-org/llama.cpp/issues/17302. + device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL || + std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) && + getenv("GGML_VK_DISABLE_ASYNC") == nullptr; + + if (!device->support_async) { + GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n"); + } + + const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); + + if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { + device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); + } else if (maintenance4_support) { + device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); + } else { + device->max_memory_allocation_size = props3.maxMemoryAllocationSize; + } + + const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE"); + + if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) { + device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE); + } else if (maintenance4_support) { + device->max_buffer_size = props4.maxBufferSize; + } else { + device->max_buffer_size = device->max_memory_allocation_size; + } + + const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE"); + + if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) { + device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE); + } else { + // Limit batching of allocations to 1GB by default to avoid fragmentation issues + device->suballocation_block_size = 1024*1024*1024; + } + device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size); + + device->subgroup_size = subgroup_props.subgroupSize; + device->subgroup_size_log2 = uint32_t(log2f(float(device->subgroup_size))); + device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + if (sm_builtins) { + device->shader_core_count = sm_props.shaderSMCount; + } else if (amd_shader_core_properties2) { + device->shader_core_count = amd_shader_core_properties2_props.activeComputeUnitCount; + } else if (device->vendor_id == VK_VENDOR_ID_INTEL) { + device->shader_core_count = ggml_vk_intel_shader_core_count(device->physical_device); + } else { + device->shader_core_count = 0; + } + device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16; + + device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && + (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic); + device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && + (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic); +#ifdef __APPLE__ + // Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_arithmetic = false; + } +#endif + device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && + (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); +#ifdef __APPLE__ + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_shuffle = false; + } +#endif + device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && + (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered); + + device->subgroup_ballot = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && + (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBallot); + + device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && + (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote); + + const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; + + device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + + if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) { + device->coopmat_support = false; + } + + device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated; + + device->min_imported_host_pointer_alignment = external_memory_host_props.minImportedHostPointerAlignment; + + device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations))); + + std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); + + // Try to find a non-graphics compute queue and transfer-focused queues + // Allow overriding avoiding the graphics queue because it can increase performance on RADV + const bool allow_graphics_queue = (getenv("GGML_VK_ALLOW_GRAPHICS_QUEUE") != nullptr); + const vk::QueueFlagBits graphics_flag = allow_graphics_queue ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics; + const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1); + const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1); + + const float priorities[] = { 1.0f, 1.0f }; + device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; + + std::vector device_queue_create_infos; + if (compute_queue_family_index != transfer_queue_family_index) { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); + } else if(!device->single_queue) { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); + } else { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); + } + vk::DeviceCreateInfo device_create_info{}; + std::vector device_extensions; + vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures(); + + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + device_features2.pNext = nullptr; + device_features2.features = (VkPhysicalDeviceFeatures)device_features; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + VkPhysicalDeviceVulkan12Features vk12_features; + vk12_features.pNext = nullptr; + vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + vk11_features.pNext = &vk12_features; + + last_struct = (VkBaseOutStructure *)&vk12_features; + + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; + pl_robustness_features.pNext = nullptr; + pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; + pl_robustness_features.pipelineRobustness = VK_FALSE; + + if (pipeline_robustness) { + last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features; + last_struct = (VkBaseOutStructure *)&pl_robustness_features; + device_extensions.push_back("VK_EXT_pipeline_robustness"); + } + + VkPhysicalDeviceMemoryPriorityFeaturesEXT memory_priority_features; + memory_priority_features.pNext = nullptr; + memory_priority_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT; + memory_priority_features.memoryPriority = VK_FALSE; + if (device->memory_priority) { + last_struct->pNext = (VkBaseOutStructure *)&memory_priority_features; + last_struct = (VkBaseOutStructure *)&memory_priority_features; + device_extensions.push_back("VK_EXT_memory_priority"); + } + + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features; + subgroup_size_control_features.pNext = nullptr; + subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT; + subgroup_size_control_features.computeFullSubgroups = false; + subgroup_size_control_features.subgroupSizeControl = false; + + if (device->subgroup_size_control) { + last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features; + last_struct = (VkBaseOutStructure *)&subgroup_size_control_features; + } + +#if defined(VK_KHR_cooperative_matrix) + VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; + coopmat_features.pNext = nullptr; + coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; + coopmat_features.cooperativeMatrix = VK_FALSE; + + if (device->coopmat_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; + last_struct = (VkBaseOutStructure *)&coopmat_features; + } +#endif + +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; + coopmat2_features.pNext = nullptr; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + device_extensions.push_back("VK_NV_cooperative_matrix2"); + } +#endif + + VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; + coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; + if (coopmat2_decode_vector_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + device_extensions.push_back(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME); + } + +#if defined(VK_KHR_shader_bfloat16) + VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; + bfloat16_features.pNext = nullptr; + bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; + if (bfloat16_support) { + last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; + last_struct = (VkBaseOutStructure *)&bfloat16_features; + device_extensions.push_back("VK_KHR_shader_bfloat16"); + } +#endif + + VkPhysicalDeviceMaintenance4Features maint4_features {}; + maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES; + if (maintenance4_support) { + last_struct->pNext = (VkBaseOutStructure *)&maint4_features; + last_struct = (VkBaseOutStructure *)&maint4_features; + device_extensions.push_back("VK_KHR_maintenance4"); + } + + VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {}; + shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; + if (device->integer_dot_product) { + last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features; + last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; + device_extensions.push_back("VK_KHR_shader_integer_dot_product"); + } + + VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE dot2_features {}; + dot2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE; + if (dot2_f16_support) { + last_struct->pNext = (VkBaseOutStructure *)&dot2_features; + last_struct = (VkBaseOutStructure *)&dot2_features; + device_extensions.push_back("VK_VALVE_shader_mixed_float_dot_product"); + } + + VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {}; + pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR; + if (pipeline_executable_properties_support) { + last_struct->pNext = (VkBaseOutStructure *)&pep_features; + last_struct = (VkBaseOutStructure *)&pep_features; + device_extensions.push_back("VK_KHR_pipeline_executable_properties"); + } + + if (device->external_memory_host) { + device_extensions.push_back("VK_EXT_external_memory_host"); + } + +#if defined(VK_EXT_shader_64bit_indexing) + VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {}; + shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT; + if (device->shader_64b_indexing) { + last_struct->pNext = (VkBaseOutStructure *)&shader_64bit_indexing_features; + last_struct = (VkBaseOutStructure *)&shader_64bit_indexing_features; + device_extensions.push_back("VK_EXT_shader_64bit_indexing"); + } +#endif + + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); + + device->pipeline_executable_properties_support = pipeline_executable_properties_support; + + device->fp16 = device->fp16 && vk12_features.shaderFloat16; + +#if defined(VK_KHR_shader_bfloat16) + device->bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; +#else + device->bf16 = false; +#endif + + device->dot2_f16 = dot2_f16_support && dot2_features.shaderMixedFloatDotProductFloat16AccFloat32; + + device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + + device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 && + device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_multi_add_push_constants) && + getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr; + + device->shader_int64 = device_features2.features.shaderInt64; + device->buffer_device_address = vk12_features.bufferDeviceAddress; + device->vulkan_memory_model = vk12_features.vulkanMemoryModel; + + if (device->subgroup_size_control) { + device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize; + device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize; + device_extensions.push_back("VK_EXT_subgroup_size_control"); + } + + device->subgroup_size_control = device->subgroup_size_control && + (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) && + subgroup_size_control_features.subgroupSizeControl; + + device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups; + +#if defined(VK_KHR_cooperative_matrix) + device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; + device->coopmat1_fa_support = device->coopmat_support && device->subgroup_require_full_support; +#endif + + if (coopmat2_support) { +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + coopmat2_features.cooperativeMatrixBlockLoads && + vk12_features.bufferDeviceAddress) { + + std::vector flexible_dimensions; + uint32_t count = 0; + + PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = + (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV) + vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr); + + VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {}; + empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; + flexible_dimensions.resize(count, empty_prop); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data()); + + bool found_fp16_128 = false, + found_fp16_256 = false, + found_fp32_128 = false, + found_fp32_256 = false; + bool found_bf16_128 = false, + found_bf16_256 = false; + // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 + // with 32x16x16 and 256 with 32x32x16. + for (auto &prop : flexible_dimensions) { + if (prop.saturatingAccumulation == VK_FALSE && + prop.scope == VK_SCOPE_WORKGROUP_KHR) { + + if (prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_128 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_128 = true; + } + } + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_256 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_256 = true; + } + } + } + +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + found_bf16_128 = true; + } + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + found_bf16_256 = true; + } + } +#endif + } + } + if (found_fp16_128 && found_fp16_256 && + found_fp32_128 && found_fp32_256 && + coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { + device->coopmat2 = true; + device->coopmat2_bf16_support = found_bf16_128 && found_bf16_256; + device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; + } + } +#endif + } + + if (!vk11_features.storageBuffer16BitAccess) { + std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; + throw std::runtime_error("Unsupported device"); + } + + device_extensions.push_back("VK_KHR_16bit_storage"); + +#ifdef GGML_VULKAN_VALIDATE + device_extensions.push_back("VK_KHR_shader_non_semantic_info"); +#endif + + if (device->fp16) { + device_extensions.push_back("VK_KHR_shader_float16_int8"); + } + +#if defined(VK_KHR_cooperative_matrix) + if (device->coopmat_support) { + // Query supported shapes + std::vector cm_props; + + PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = + (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vk_instance.instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); + + uint32_t cm_props_num; + + pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, nullptr); + + cm_props.resize(cm_props_num); + + for (auto& prop : cm_props) { + prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + } + + pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, cm_props.data()); + + VK_LOG_DEBUG("ggml_vulkan: Cooperative Matrix Shapes: " << cm_props.size()); + + for (auto& prop : cm_props) { + VK_LOG_DEBUG("ggml_vulkan: M: " << prop.MSize << " N: " << prop.NSize << " K: " << prop.KSize << " A: " << vk::to_string((vk::ComponentTypeKHR)prop.AType) << " B: " << vk::to_string((vk::ComponentTypeKHR)prop.BType) << " C: " << vk::to_string((vk::ComponentTypeKHR)prop.CType) << " Result: " << vk::to_string((vk::ComponentTypeKHR)prop.ResultType) << " saturatingAccumulation: " << prop.saturatingAccumulation << " scope: " << vk::to_string((vk::ScopeKHR)prop.scope)); + + if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eFloat16 && + (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eFloat16 && + (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup + ) { + if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat32 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat32) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_acc_f32_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_acc_f32_support = true; + } + if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) { + device->coopmat_support_16x16x16_f32acc = true; + } + } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_acc_f16_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_acc_f16_support = true; + } + if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) { + device->coopmat_support_16x16x16_f16acc = true; + } + } + } else if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eSint8 && + (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eSint8 && + (vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eSint32 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eSint32 && + (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup && + device->coopmat_int_m == 0 + ) { + device->coopmat_int_support = true; + device->coopmat_int_m = prop.MSize; + device->coopmat_int_n = prop.NSize; + device->coopmat_int_k = prop.KSize; + } +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && + (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup + ) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_bf16_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_bf16_support = true; + } + } +#endif + } + + if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) { + // No suitable matmul mode found + GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n"); + device->coopmat_support = false; + } + if (getenv("GGML_VK_DISABLE_BFLOAT16")) { + device->coopmat_bf16_support = false; + } + } + + if (device->coopmat_support) { + device_extensions.push_back("VK_KHR_cooperative_matrix"); + } +#if defined(VK_KHR_shader_bfloat16) + if (device->coopmat_bf16_support) { + device_extensions.push_back("VK_KHR_shader_bfloat16"); + } +#endif +#endif + device->name = GGML_VK_NAME + std::to_string(idx); + + device_create_info + .setFlags(vk::DeviceCreateFlags()) + .setQueueCreateInfos(device_queue_create_infos) + .setPEnabledExtensionNames(device_extensions); + device_create_info.setPNext(&device_features2); + device->device = device->physical_device.createDevice(device_create_info); + + // Queues + ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false); + + // Shaders + // Disable matmul tile sizes early if performance low or not supported + for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { + switch (device->vendor_id) { +#ifndef GGML_VULKAN_RUN_TESTS + case VK_VENDOR_ID_AMD: + device->mul_mat_l[i] = device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary; + device->mul_mat_m[i] = true; + device->mul_mat_s[i] = true; + device->mul_mat_id_l[i] = false; + device->mul_mat_id_m[i] = true; + device->mul_mat_id_s[i] = true; + break; + case VK_VENDOR_ID_INTEL: { + // Current Windows driver does not expose BF16 support. + // We only want to use l_warptile if coopmat is available and is Xe2+ + const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2; + const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat; + device->mul_mat_l[i] = use_l_warptile; + device->mul_mat_id_l[i] = use_l_warptile; + device->mul_mat_m[i] = true; + device->mul_mat_s[i] = true; + device->mul_mat_id_m[i] = true; + device->mul_mat_id_s[i] = true; + break; + } + case VK_VENDOR_ID_APPLE: + device->mul_mat_l[i] = false; + device->mul_mat_m[i] = true; + device->mul_mat_s[i] = false; + device->mul_mat_id_l[i] = false; + device->mul_mat_id_m[i] = true; + device->mul_mat_id_s[i] = false; + break; +#endif + default: + device->mul_mat_l[i] = true; + device->mul_mat_m[i] = true; + device->mul_mat_s[i] = true; + device->mul_mat_id_l[i] = true; + device->mul_mat_id_m[i] = true; + device->mul_mat_id_s[i] = true; + break; + } + +#if VK_HEADER_VERSION >= 287 + // Honeykrisp driver for Asahi Linux doesn't report VK_VENDOR_ID_APPLE. + // Check for Honeykrisp driver and force same configuration as the VK_VENDOR_ID_APPLE case. + if (device->driver_id == vk::DriverId::eMesaHoneykrisp) { + device->mul_mat_l[i] = false; + device->mul_mat_m[i] = true; + device->mul_mat_s[i] = false; + device->mul_mat_id_l[i] = false; + device->mul_mat_id_m[i] = true; + device->mul_mat_id_s[i] = false; + } +#endif + + device->mul_mat_l_int[i] = device->mul_mat_l[i]; + device->mul_mat_m_int[i] = device->mul_mat_m[i]; + device->mul_mat_s_int[i] = device->mul_mat_s[i]; + device->mul_mat_id_l_int[i] = device->mul_mat_id_l[i]; + device->mul_mat_id_m_int[i] = device->mul_mat_id_m[i]; + device->mul_mat_id_s_int[i] = device->mul_mat_id_s[i]; + } + + + std::vector dsl_binding; + std::vector dsl_binding_flags; + for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) { + dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); + dsl_binding_flags.push_back({}); + } + + vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; + + vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( + {}, + dsl_binding); + descriptor_set_layout_create_info.setPNext(&dslbfci); + device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); + + ggml_vk_load_shaders(device); + + // Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled. + const bool prefers_transfer_queue = + device->vendor_id == VK_VENDOR_ID_AMD && + device->architecture != AMD_GCN && + !device->uma && + !allow_graphics_queue; + + if (!device->single_queue) { + const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; + ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); + + device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); + } else { + // TODO: Use pointer or reference to avoid copy + device->transfer_queue.copyFrom(device->compute_queue); + device->transfer_queue.cmd_pool.init(device, &device->transfer_queue); + + device->async_use_transfer_queue = false; + } + + device->buffer_type = { + /* .iface = */ ggml_backend_vk_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx), + /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device }, + }; + + device->fence = device->device.createFence({}); + + device->idx = idx; + + device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr; + + device->add_rms_fusion = !device->disable_fusion && + device->subgroup_arithmetic && + device->vendor_id != VK_VENDOR_ID_INTEL; + device->partials_binding_alignment = + std::max(4u, (uint32_t)device->properties.limits.minStorageBufferOffsetAlignment); + + device->mmvq_mode = 0; + if (getenv("GGML_VK_DISABLE_MMVQ")) { + device->mmvq_mode = -1; + } else if (getenv("GGML_VK_FORCE_MMVQ")) { + device->mmvq_mode = 1; + } + + return device; + } + + return vk_instance.devices[idx]; +} + +static void ggml_vk_print_gpu_info(size_t idx) { + GGML_ASSERT(idx < vk_instance.device_indices.size()); + size_t dev_num = vk_instance.device_indices[idx]; + VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")"); + GGML_ASSERT(vk_instance_initialized); + + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + if (dev_num >= devices.size()) { + std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; + throw std::runtime_error("Device not found"); + } + + vk::PhysicalDevice physical_device = devices[dev_num]; + std::vector ext_props = physical_device.enumerateDeviceExtensionProperties(); + + bool fp16_storage = false; + bool fp16_compute = false; + bool coopmat_support = false; + bool coopmat2_support = false; + bool coopmat2_decode_vector_support = false; + bool integer_dot_product = false; + bool bfloat16_support = false; + bool dot2_f16_support = false; + + for (auto properties : ext_props) { + if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT")) { + coopmat_support = true; +#endif +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2")) { + coopmat2_support = true; +#endif + } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) { + coopmat2_decode_vector_support = true; +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { + integer_dot_product = true; +#endif +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_BFLOAT16")) { + bfloat16_support = true; +#endif + } else if (strcmp("VK_VALVE_shader_mixed_float_dot_product", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_DOT2")) { + dot2_f16_support = true; + } + } + + const vk_device_architecture device_architecture = get_device_architecture(physical_device); + + const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); + bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; + + bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceMaintenance3Properties props3; + vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceDriverProperties driver_props; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props; + props2.pNext = &props3; + props3.pNext = &subgroup_props; + subgroup_props.pNext = &driver_props; + + // Pointer to the last chain element + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; + + if (integer_dot_product) { + last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props; + last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props; + } + + physical_device.getProperties2(&props2); + + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + device_features2.pNext = nullptr; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + VkPhysicalDeviceVulkan12Features vk12_features; + vk12_features.pNext = nullptr; + vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + vk11_features.pNext = &vk12_features; + + // Pointer to the last chain element + last_struct = (VkBaseOutStructure *)&vk12_features; + +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; + coopmat_features.pNext = nullptr; + coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; + coopmat_features.cooperativeMatrix = VK_FALSE; + + if (coopmat_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; + last_struct = (VkBaseOutStructure *)&coopmat_features; + } +#endif + + VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {}; + shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; + if (integer_dot_product) { + last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features; + last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; + } + +#if defined(VK_KHR_shader_bfloat16) + VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; + bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; + if (bfloat16_support) { + last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; + last_struct = (VkBaseOutStructure *)&bfloat16_features; + } +#endif + +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + } +#endif + + VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; + coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; + if (coopmat2_decode_vector_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + } + + VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE dot2_features {}; + dot2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE; + if (dot2_f16_support) { + last_struct->pNext = (VkBaseOutStructure *)&dot2_features; + last_struct = (VkBaseOutStructure *)&dot2_features; + } + + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); + + fp16 = fp16 && vk12_features.shaderFloat16; + +#if defined(VK_KHR_shader_bfloat16) + bool bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; +#else + bool bf16 = false; +#endif + + uint32_t default_subgroup_size = get_subgroup_size("", device_architecture); + const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; + const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + + integer_dot_product = integer_dot_product + && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated + && shader_integer_dot_product_features.shaderIntegerDotProduct; + + coopmat_support = coopmat_support +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + && coopmat_features.cooperativeMatrix +#endif + && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture); + +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + coopmat2_support = coopmat2_support && + coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + coopmat2_features.cooperativeMatrixBlockLoads; +#else + coopmat2_support = false; +#endif + + coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; +#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + coopmat2_decode_vector_support = false; +#endif + + std::string matrix_cores = coopmat2_support ? (coopmat2_decode_vector_support ? "NV_coopmat2v" : "NV_coopmat2") + : coopmat_support ? "KHR_coopmat" + : "none"; + + bool dot2_f16 = dot2_f16_support && dot2_features.shaderMixedFloatDotProductFloat16AccFloat32; + const char *fp16_str = fp16 ? (dot2_f16 ? "dot2" : "1") : "0"; + + std::string device_name = props2.properties.deviceName.data(); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %s | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16_str, bf16, subgroup_size, + props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str()); + + if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { + GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); + } +} + +static DispatchLoaderDynamic ggml_vk_default_dispatcher_instance; + +DispatchLoaderDynamic & ggml_vk_default_dispatcher() { + return ggml_vk_default_dispatcher_instance; +} + +void ggml_vk_instance_init() { + if (vk_instance_initialized) { + return; + } + VK_LOG_DEBUG("ggml_vk_instance_init()"); + + // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- + ggml_vk_default_dispatcher_instance.init(vkGetInstanceProcAddr); + + uint32_t api_version = vk::enumerateInstanceVersion(); + + if (api_version < VK_API_VERSION_1_2) { + std::cerr << "ggml_vulkan: Error: Vulkan 1.2 required." << std::endl; + throw vk::SystemError(vk::Result::eErrorFeatureNotPresent, "Vulkan 1.2 required"); + } + + vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version }; + + const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); + const bool layer_settings = ggml_vk_instance_layer_settings_available(); +#ifdef __APPLE__ + const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); +#endif + const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr; + std::vector layers; + + if (layer_settings) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + std::vector extensions; + if (layer_settings) { + extensions.push_back("VK_EXT_layer_settings"); + } +#ifdef __APPLE__ + if (portability_enumeration_ext) { + extensions.push_back("VK_KHR_portability_enumeration"); + } +#endif + if (debug_utils_ext) { + extensions.push_back("VK_EXT_debug_utils"); + } + VkBool32 enable_best_practice = layer_settings; + std::vector settings = { + { + "VK_LAYER_KHRONOS_validation", + "validate_best_practices", + vk::LayerSettingTypeEXT::eBool32, + 1, + &enable_best_practice + }, + }; + vk::LayerSettingsCreateInfoEXT layer_setting_info(settings); + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions, &layer_setting_info); +#ifdef __APPLE__ + if (portability_enumeration_ext) { + instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; + } +#endif + + vk_instance.instance = vk::createInstance(instance_create_info); + vk_instance_initialized = true; + + if (debug_utils_ext) { + vk_instance.debug_utils_support = true; + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT"); + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT"); + } + + vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr; + vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr; + vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr; + const char* GGML_VK_PIPELINE_STATS = getenv("GGML_VK_PIPELINE_STATS"); + if (GGML_VK_PIPELINE_STATS != nullptr) { + vk_pipeline_stats_filter = GGML_VK_PIPELINE_STATS; + } + const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY"); + + if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) { + vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY); + } + + // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- + VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance); + + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan + char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); + if (devices_env != nullptr) { + size_t num_available_devices = devices.size(); + + std::string devices(devices_env); + std::replace(devices.begin(), devices.end(), ',', ' '); + + std::stringstream ss(devices); + size_t tmp; + while (ss >> tmp) { + if(tmp >= num_available_devices) { + std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl; + throw std::runtime_error("Invalid Vulkan device index"); + } + vk_instance.device_indices.push_back(tmp); + } + } else { + // If no vulkan devices are found, return early + if (devices.empty()) { + GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); + return; + } + + // Default to using all dedicated GPUs + for (size_t i = 0; i < devices.size(); i++) { + vk::PhysicalDeviceProperties2 new_props; + vk::PhysicalDeviceDriverProperties new_driver; + vk::PhysicalDeviceIDProperties new_id; + new_props.pNext = &new_driver; + new_driver.pNext = &new_id; + devices[i].getProperties2(&new_props); + + if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) { + // Check if there are two physical devices corresponding to the same GPU + // This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux), + // see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication. + // MoltenVK on macOS may report the same UUID for distinct GPUs on multi-GPU cards, + // see https://github.com/KhronosGroup/MoltenVK/issues/2683. Skip when both old/new + // driver is MoltenVK + auto old_device = std::find_if( + vk_instance.device_indices.begin(), + vk_instance.device_indices.end(), + [&devices, &new_id, &new_driver](const size_t k){ + vk::PhysicalDeviceProperties2 old_props; + vk::PhysicalDeviceDriverProperties old_driver; + vk::PhysicalDeviceIDProperties old_id; + old_props.pNext = &old_driver; + old_driver.pNext = &old_id; + devices[k].getProperties2(&old_props); + + bool same_uuid = std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID)); + same_uuid = same_uuid || ( + old_id.deviceLUIDValid && new_id.deviceLUIDValid && + std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID)) + ); + bool both_molten_vk = (new_driver.driverID == vk::DriverId::eMoltenvk && old_driver.driverID == vk::DriverId::eMoltenvk); + + return same_uuid && !both_molten_vk; + } + ); + if (old_device == vk_instance.device_indices.end()) { + vk_instance.device_indices.push_back(i); + } else { + // There can be two physical devices corresponding to the same GPU if there are 2 different drivers + // This can cause error when splitting layers aross the devices, need to keep only 1 + VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID"); + + vk::PhysicalDeviceProperties2 old_props; + vk::PhysicalDeviceDriverProperties old_driver; + old_props.pNext = &old_driver; + devices[*old_device].getProperties2(&old_props); + + std::map driver_priorities {}; + int old_priority = std::numeric_limits::max(); + int new_priority = std::numeric_limits::max(); + + // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id + // Smaller number -> higher priority + switch (old_props.properties.vendorID) { + case VK_VENDOR_ID_AMD: + driver_priorities[vk::DriverId::eMesaRadv] = 1; + driver_priorities[vk::DriverId::eAmdOpenSource] = 2; + driver_priorities[vk::DriverId::eAmdProprietary] = 3; + break; + case VK_VENDOR_ID_INTEL: + driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1; + driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2; + break; + case VK_VENDOR_ID_NVIDIA: + driver_priorities[vk::DriverId::eNvidiaProprietary] = 1; +#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235 + driver_priorities[vk::DriverId::eMesaNvk] = 2; +#endif + break; + case VK_VENDOR_ID_QUALCOMM: + driver_priorities[vk::DriverId::eQualcommProprietary] = 1; + driver_priorities[vk::DriverId::eMesaTurnip] = 2; + break; + } + driver_priorities[vk::DriverId::eMesaDozen] = 100; + + if (driver_priorities.count(old_driver.driverID)) { + old_priority = driver_priorities[old_driver.driverID]; + } + if (driver_priorities.count(new_driver.driverID)) { + new_priority = driver_priorities[new_driver.driverID]; + } + + if (new_priority < old_priority) { + auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device); + vk_instance.device_indices.erase(r, vk_instance.device_indices.end()); + vk_instance.device_indices.push_back(i); + + VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName); + } + else { + VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl); + } + } + } + } + + // If no GPUs found, fall back to the first non-CPU device. + // If only CPU devices are available, return without devices. + if (vk_instance.device_indices.empty()) { + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) { + vk_instance.device_indices.push_back(i); + break; + } + } + } + + if (vk_instance.device_indices.empty()) { + GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); + return; + } + } + GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size()); + + for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { + vk::PhysicalDevice vkdev = devices[vk_instance.device_indices[i]]; + std::vector extensionprops = vkdev.enumerateDeviceExtensionProperties(); + + bool membudget_supported = false; + for (const auto & ext : extensionprops) { + if (strcmp(VK_EXT_MEMORY_BUDGET_EXTENSION_NAME, ext.extensionName) == 0) { + membudget_supported = true; + break; + } + } + + vk_instance.device_supports_membudget.push_back(membudget_supported); + + ggml_vk_print_gpu_info(i); + } +} + +void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { + VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")"); + ggml_vk_instance_init(); + GGML_ASSERT(idx < vk_instance.device_indices.size()); + + ctx->name = GGML_VK_NAME + std::to_string(idx); + + ctx->device = ggml_vk_get_device(idx); + + ctx->semaphore_idx = 0; + ctx->event_idx = 0; + + ctx->prealloc_size_x = 0; + ctx->prealloc_size_y = 0; + ctx->prealloc_size_split_k = 0; + // Fixed size of 1KB, for deterministic behavior + ctx->prealloc_size_add_rms_partials = 1024; + + ctx->fence = ctx->device->device.createFence({}); + ctx->almost_ready_fence = ctx->device->device.createFence({}); + + ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); + if (ctx->device->async_use_transfer_queue) { + vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; + vk::SemaphoreCreateInfo ci{}; + ci.setPNext(&tci); + ctx->transfer_semaphore.s = ctx->device->device.createSemaphore(ci); + ctx->transfer_semaphore.value = 0; + + ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); + } + + if (vk_perf_logger_enabled) { + ctx->perf_logger = std::unique_ptr(new vk_perf_logger()); + } + +#ifdef GGML_VULKAN_CHECK_RESULTS + const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); + vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); + const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR"); + vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor)); +#endif +} + +int ggml_vk_get_device_count() { + ggml_vk_instance_init(); + + return vk_instance.device_indices.size(); +} + +void ggml_vk_get_device_description(int device, char * description, size_t description_size) { + ggml_vk_instance_init(); + + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + vk::PhysicalDeviceProperties props; + devices[device].getProperties(&props); + + snprintf(description, description_size, "%s", props.deviceName.data()); +} + +bool ggml_vk_instance_layer_settings_available() { +#ifdef GGML_VULKAN_VALIDATE + // Check if validation layer provides the extension + const std::string layer_name = "VK_LAYER_KHRONOS_validation"; + for (const auto& layer : vk::enumerateInstanceLayerProperties()) { + if (layer_name == layer.layerName.data()) { + for (const auto& ext : vk::enumerateInstanceExtensionProperties(layer_name)) { + if (strcmp("VK_EXT_layer_settings", ext.extensionName.data()) == 0) { + return true; + } + } + } + } + + std::cerr << "ggml_vulkan: WARNING: Validation layer or layer extension VK_EXT_layer_settings not found." << std::endl; +#endif + return false; +} + +bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions) { +#ifdef __APPLE__ + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; +#endif + return false; + + UNUSED(instance_extensions); +} + +bool ggml_vk_instance_debug_utils_ext_available( + const std::vector & instance_extensions) { + // Check for portability enumeration extension for MoltenVK support + for (const auto & properties : instance_extensions) { + if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) { + return true; + } + } + + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl; + return false; + + UNUSED(instance_extensions); +} + +bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) { + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + vkGetPhysicalDeviceFeatures2(vkdev, &device_features2); + + return vk11_features.storageBuffer16BitAccess; +} + +bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { + switch (props.vendorID) { + case VK_VENDOR_ID_INTEL: + // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, + // while some older hardware (ex. Arc A770) has performance regressions + return arch == vk_device_architecture::INTEL_XE2; + case VK_VENDOR_ID_AMD: + if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { + // Workaround for AMD proprietary driver reporting support on all GPUs + return arch == vk_device_architecture::AMD_RDNA3; + } + return true; + default: + return true; + } +} + +uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) { + VkPhysicalDeviceProperties2 props = vkdev.getProperties2(); + + if (props.properties.vendorID != VK_VENDOR_ID_INTEL) { + return 0; + } + + const uint32_t device_id = props.properties.deviceID; + + switch (device_id) { + case 0x56A6: // A310 + return 6; + case 0x5693: // A370M + case 0x56A5: // A380 + case 0x56B1: // Pro A40/A50 + return 8; + case 0x5697: // A530M + return 12; + case 0x5692: // A550M + case 0x56B3: // Pro A60 + return 16; + case 0x56A2: // A580 + return 24; + case 0x5691: // A730M + case 0x56A1: // A750 + return 28; + case 0x56A0: // A770 + case 0x5690: // A770M + return 32; + case 0xE212: // Pro B50 + return 16; + case 0xE20C: // B570 + return 18; + case 0xE20B: // B580 + case 0xE211: // Pro B60 + return 20; + default: + return 0; + } +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp new file mode 100644 index 0000000000..dba315b7fb --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp @@ -0,0 +1,2207 @@ +#include "ggml-vulkan-common.h" + +vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { + VK_LOG_DEBUG("ggml_vk_get_to_fp16()"); + switch (type) { + case GGML_TYPE_F32: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + break; + default: + return nullptr; + } + + return ctx->device->pipeline_dequant[type]; +} + +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { + VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ", " << prec << ")"); + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f32; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f32_f16; + } + if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { + return ctx->device->pipeline_matmul_bf16; + } + if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f32acc; + } + } + + // MMQ + if (src1_type == GGML_TYPE_Q8_1) { + vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc; + + if (pipelines->is_empty()) { + return nullptr; + } + + return pipelines; + } + + if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) { + return nullptr; + } + + switch (src0_type) { + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + break; + default: + return nullptr; + } + + if (ctx->device->coopmat2) { + assert(src1_type == GGML_TYPE_F16); + return prec == GGML_PREC_DEFAULT ? ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f32acc; + } + if (ctx->device->coopmat_support) { + return (ctx->device->fp16 && ctx->device->coopmat_acc_f16_support && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; + } + return (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; +} + +static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols, uint32_t m, uint32_t k) { + VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()"); + GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16 || b_type == GGML_TYPE_Q8_1); + GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols); + + if (b_type == GGML_TYPE_Q8_1) { + switch (a_type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_MXFP4: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + break; + default: + return nullptr; + } + } + + switch (a_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + break; + default: + return nullptr; + } + + // heuristic to choose workgroup size + uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; + if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { + // Prefer larger workgroups when M is small, to spread the work out more + // and keep more SMs busy. + // q6_k seems to prefer small workgroup size even for "medium" values of M. + if (a_type == GGML_TYPE_Q6_K) { + if (m < 4096 && k >= 1024) { + dmmv_wg = DMMV_WG_SIZE_LARGE; + } + } else { + if (m <= 8192 && k >= 1024) { + dmmv_wg = DMMV_WG_SIZE_LARGE; + } + } + } + + if (b_type == GGML_TYPE_Q8_1) { + if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { + dmmv_wg = DMMV_WG_SIZE_SUBGROUP; + } + return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1]; + } + + return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1]; +} + +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { + VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()"); + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f32; + } + if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { + return ctx->device->pipeline_matmul_id_bf16; + } + if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16.f32acc; + } + } + + // MMQ + if (src1_type == GGML_TYPE_Q8_1) { + vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc; + + if (pipelines->is_empty()) { + return nullptr; + } + + return pipelines; + } + + GGML_ASSERT(src1_type == GGML_TYPE_F32 || (ctx->device->coopmat2 && src1_type == GGML_TYPE_F16)); + + switch (src0_type) { + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + break; + default: + return nullptr; + } + + vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; + // XXX TODO 'prec' is not actually allowed in mul_mat_id. + bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/; + bool support_fp16acc = !mmp.f16acc->is_empty(); + bool support_fp32acc = !mmp.f32acc->is_empty(); + + if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) { + return mmp.f16acc; + } else { + GGML_ASSERT(support_fp32acc); + return mmp.f32acc; + } +} + +static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t m, uint32_t k) { + VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec_id()"); + GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_Q8_1); + + if (b_type == GGML_TYPE_Q8_1) { + switch (a_type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_MXFP4: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + break; + default: + return nullptr; + } + } + + switch (a_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + break; + default: + return nullptr; + } + + // heuristic to choose workgroup size + uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; + if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { + // Prefer larger workgroups when M is small, to spread the work out more + // and keep more SMs busy. + // q6_k seems to prefer small workgroup size even for "medium" values of M. + if (a_type == GGML_TYPE_Q6_K) { + if (m < 4096 && k >= 1024) { + dmmv_wg = DMMV_WG_SIZE_LARGE; + } + } else { + if (m <= 8192 && k >= 1024) { + dmmv_wg = DMMV_WG_SIZE_LARGE; + } + } + } + + if (b_type == GGML_TYPE_Q8_1) { + if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { + dmmv_wg = DMMV_WG_SIZE_SUBGROUP; + } + return ctx->device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[dmmv_wg][a_type]; + } + + return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[dmmv_wg][a_type]; +} + +static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, bool disable_split_k, const vk_pipeline& pipeline) { + VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << disable_split_k << ")"); + + if (disable_split_k) { + return 1; + } + + uint32_t split_k = 1; + if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) { + // If k is 'large' and the SMs will fill less than halfway, use split_k. + uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]); + uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]); + + if (k >= 2048) { + if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) { + split_k = ctx->device->shader_core_count / (m_tiles * n_tiles); + } else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) { + split_k = 3; + } + // Cap the split at 8x. Unless k is huge this is a lot of overhead. + split_k = std::min(split_k, 8u); + + // ggml_vk_matmul will align the splits to be a multiple of 256. + // If this rounded up size would cause the last split to be empty, + // then reduce the split count. + while (true) { + if (split_k == 1) { + break; + } + uint32_t k_split = CEIL_DIV(k, split_k); + k_split = ROUNDUP_POW2(k_split, 256); + if (k_split * (split_k - 1) < k) { + break; + } + split_k--; + } + } + } + + return split_k; +} + +static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type, ggml_type src1_type) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); + + // The q8_1 (integer dot) mmq path uses a different shader with its own + // shared-memory layout, so use the int-specific availability flags. + const bool is_q8_1 = (src1_type == GGML_TYPE_Q8_1); + const bool mm_l = is_q8_1 ? ctx->device->mul_mat_l_int[src0_type] : ctx->device->mul_mat_l[src0_type]; + const bool mm_m = is_q8_1 ? ctx->device->mul_mat_m_int[src0_type] : ctx->device->mul_mat_m[src0_type]; + const bool mm_s = is_q8_1 ? ctx->device->mul_mat_s_int[src0_type] : ctx->device->mul_mat_s[src0_type]; + + if (ctx->device->coopmat2) { + const uint32_t shader_core_count = ctx->device->shader_core_count; + const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]); + const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]); + + // Use large shader when the N dimension is greater than the medium shader's tile size + uint32_t crossover_large = mmp->m->wg_denoms[1]; + + // Prefer large over medium if either: + // - medium or large tiles would overfill the GPU + // - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not + // (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead) + bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count || + // split_k==3 with large tiles likely better than medium tiles with no split_k. + (tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2); + + if ((mm_l && (n > crossover_large && prefer_large)) || (!mm_m && !mm_s)) { + return aligned ? mmp->a_l : mmp->l; + } + // Use medium shader when the N dimension is greater than the small shader's tile size + uint32_t crossover_medium = mmp->s->wg_denoms[1]; + if ((mm_m && (n > crossover_medium)) || !mm_s) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + + if ((mm_s && (m <= 32 || n <= 32)) || (!mm_m && !mm_l)) { + return aligned ? mmp->a_s : mmp->s; + } + if ((mm_m && (m <= 64 || n <= 64)) || !mm_l) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_l : mmp->l; +} + +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); + return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type, src1_type)->align; +} + +void ggml_vk_matmul( + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, + vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, + uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, + uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, + uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, + uint32_t padded_n) { + VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")"); + if (split_k == 1) { + ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2])); + + uint32_t base_work_group_z = 0; + while (base_work_group_z < batch) { + uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + + const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z }); + base_work_group_z += groups_z; + } + return; + } + + if (ctx->prealloc_split_k_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + GGML_ASSERT(batch_stride_d == m * n); + + // Round the split size up to a multiple of 256 (k-quant alignment) + uint32_t k_split = CEIL_DIV(k, split_k); + k_split = ROUNDUP_POW2(k_split, 256); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2])); + + uint32_t base_work_group_z = 0; + while (base_work_group_z < batch) { + uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + + const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n }; + // Make sure enough workgroups get assigned for split k to work + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z }); + base_work_group_z += groups_z; + } + ggml_vk_sync_buffers(ctx, subctx); + const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); + ctx->prealloc_split_k_need_sync = true; +} + +static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type, ggml_type src1_type) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); + + // The q8_1 (integer dot) mmq path uses a different shader with its own + // shared-memory layout, so use the int-specific availability flags. + const bool is_q8_1 = (src1_type == GGML_TYPE_Q8_1); + const bool mm_l = is_q8_1 ? ctx->device->mul_mat_id_l_int[src0_type] : ctx->device->mul_mat_id_l[src0_type]; + const bool mm_m = is_q8_1 ? ctx->device->mul_mat_id_m_int[src0_type] : ctx->device->mul_mat_id_m[src0_type]; + const bool mm_s = is_q8_1 ? ctx->device->mul_mat_id_s_int[src0_type] : ctx->device->mul_mat_id_s[src0_type]; + + if (ctx->device->coopmat2) { + // Use large shader when the N dimension is greater than the medium shader's tile size + uint32_t crossover_large = mmp->m->wg_denoms[1]; + if ((mm_l && (n > crossover_large)) || (!mm_m && !mm_s)) { + return aligned ? mmp->a_l : mmp->l; + } + // Use medium shader when the N dimension is greater than the small shader's tile size + uint32_t crossover_medium = mmp->s->wg_denoms[1]; + if ((mm_m && (n > crossover_medium)) || !mm_s) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + + if ((mm_s && (m <= 32 || n <= 32)) || (!mm_m && !mm_l)) { + return aligned ? mmp->a_s : mmp->s; + } + if ((mm_m && (m <= 64 || n <= 64)) || !mm_l) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_l : mmp->l; +} + +static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); + return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true, src0_type, src1_type)->align; +} + +static void ggml_vk_matmul_id( + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, + vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, const vk_subbuffer & expert_count_buf, + uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, + uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, + uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11, + uint32_t padded_n) { + VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), expert_count: (" << expert_count_buf.buffer->buffer << ", " << expert_count_buf.offset << ", " << expert_count_buf.size << "), " << + "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " << + "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " << + "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"); + const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, + nei0, nei1, nbi1, ne11, padded_n }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids, expert_count_buf }, pc, { m, nei1, n_as }); +} + +bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]); +} + +vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { + + // Choose "contiguous copy" shader if src/dst are contiguous + bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst)); + + // Use optimized "transpose" shader if src dim1 is the innermost dimension. + bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src); + + if (transpose && src->type == to) { + if (ggml_type_size(to) == 4) { + return ctx->device->pipeline_cpy_transpose_32; + } else if (ggml_type_size(to) == 2) { + return ctx->device->pipeline_cpy_transpose_16; + } + } + + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_cpy_f32_f32; + } + } + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f16; + } else { + return ctx->device->pipeline_cpy_f32_f16; + } + } + if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f16; + } else { + return ctx->device->pipeline_cpy_f16_f16; + } + } + if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f32; + } else { + return ctx->device->pipeline_cpy_f16_f32; + } + } + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_BF16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_bf16; + } else { + return ctx->device->pipeline_cpy_f32_bf16; + } + } + if (src->type == GGML_TYPE_BF16 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_bf16_f32; + } else { + return ctx->device->pipeline_cpy_bf16_f32; + } + } + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_I32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_i32; + } else { + return ctx->device->pipeline_cpy_f32_i32; + } + } + if (src->type == GGML_TYPE_I32 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_i32_f32; + } else { + return ctx->device->pipeline_cpy_i32_f32; + } + } + if (src->type == GGML_TYPE_F32) { + switch (to) { + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return ctx->device->pipeline_cpy_f32_quant[to]; + default: + break; + } + } + + if (to == GGML_TYPE_F32) { + switch (src->type) { + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return ctx->device->pipeline_cpy_quant_f32[src->type]; + default: + break; + } + } + + if (src->type == to) { + // Copy two or four bytes at a time, depending on block size. + // For quantized types, we scale by block size/type size. But + // this path is also used for bf16->bf16 for example, where the + // type size must be exactly 2 or 4. + GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); + if ((ggml_type_size(src->type) % 4) == 0) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_cpy_f32_f32; + } + } else { + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f16; + } else { + return ctx->device->pipeline_cpy_f16_f16; + } + } + } + + std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; + GGML_ABORT("fatal error"); +} + +static void ggml_vk_cpy_to_strided( + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, + const vk_subbuffer & in, const vk_subbuffer & out, + uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) { + VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; + std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); + + const uint32_t ne = ggml_nelements(tensor); + std::array elements; + + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = nb10; + pc.nb11 = nb11; + pc.nb12 = nb12; + pc.nb13 = nb13; + init_pushconst_fastdiv(pc); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); +} + +vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { + switch(type) { + case GGML_TYPE_Q8_1: + return ctx->device->pipeline_quantize_q8_1_x4; + default: + std::cerr << "Missing quantize pipeline for type: " << ggml_type_name(type) << std::endl; + GGML_ABORT("fatal error"); + } +} + +void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, const vk_subbuffer & in, const vk_subbuffer & out, uint32_t ne) { + VK_LOG_DEBUG("ggml_vk_quantize_q8_1(" << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ", " << ne << ")"); + + vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + + const uint32_t num_blocks = CEIL_DIV(ne, pipeline->wg_denoms[0]); + // clamp the number of elements to the max workgroup count. The shader will iterate over the total number of blocks. + const uint64_t max_elements = std::min(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits::max()); + const uint32_t elements = std::min(ne, static_cast(max_elements)); + + const vk_quantize_q8_1_push_constants pc = { + ne, + num_blocks, + }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 }); + ggml_vk_sync_buffers(ctx, subctx); +} + +static vk_pipeline ggml_vk_get_64b_indexing_pipeline(ggml_backend_vk_context * ctx, vk_pipeline &pipeline) { + GGML_UNUSED(ctx); +#if defined(VK_EXT_shader_64bit_indexing) + vk_pipeline *ptr = &pipeline; + while (*ptr) { + if ((*ptr)->is_64b_indexing) { + return *ptr; + } + ptr = &(*ptr)->next; + } +#endif + return pipeline; +} + +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) { + VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "))"); + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + const uint64_t ne03 = src0->ne[3]; + + const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + const uint64_t ne13 = src1->ne[3]; + + const uint64_t ne21 = dst->ne[1]; + const uint32_t stride_d = dst->nb[1] / ggml_type_size(dst->type); + const uint32_t stride_batch_d = stride_d*ne21; + + const uint64_t r2 = ne12 / ne02; + const uint64_t r3 = ne13 / ne03; + + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + + vk_buffer d_Qx = nullptr; + size_t qx_buf_offset = 0; + vk_buffer d_Qy = nullptr; + size_t qy_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset); + src0_uma = d_Qx != nullptr; + src1_uma = d_Qy != nullptr; + } + + // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf + const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || + !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || + !ggml_vk_dim01_contiguous(src1); + + // If src0 is BF16, try to use a BF16 x BF16 multiply + ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; + + const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; + + bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0; + + // Check for mmq first + vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr; + + if (mmp == nullptr) { + // Fall back to f16 dequant mul mat + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); + quantize_y = false; + } + + const bool qx_needs_dequant = mmp == nullptr || x_non_contig; + const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); + + if (qx_needs_dequant) { + // Fall back to dequant + f16 mulmat + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); + } + + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + + const ggml_type effective_src1_type = quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type); + + const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? f16_type : src0->type, effective_src1_type)); + const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && ne11 > 8; + + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, effective_src1_type); + + if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); + } + + // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking + uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11; + const uint64_t x_ne = ggml_nelements(src0); + // 128 elements per Q8_1 x4 block + const uint64_t y_ne = padded_n * ne10 * ne12 * ne13; + const uint64_t d_ne = ggml_nelements(dst); + + const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, disable_split_k, pipeline); + + const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); + const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); + const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); + const uint64_t d_sz = sizeof(float) * d_ne; + + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + vk_pipeline to_q8_1 = nullptr; + + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); + } else { + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + + if (quantize_y) { + to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + } + + { + const uint64_t split_k_size = split_k > 1 ? d_sz * split_k : 0; + if ( + (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || + (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange) || + (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { + ctx->prealloc_size_y = y_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + + // Request descriptor sets + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); + } + if (quantize_y) { + ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); + } + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); + } + } + + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + GGML_ASSERT(d_D != nullptr); + GGML_ASSERT(d_D->size >= d_buf_offset + d_sz); + vk_buffer d_X; + uint64_t x_buf_offset = 0; + vk_buffer d_Y; + uint64_t y_buf_offset = 0; + if (!src0_uma) { + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; + GGML_ASSERT(d_Qx != nullptr); + } + if (!src1_uma) { + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; + GGML_ASSERT(d_Qy != nullptr); + } + if (qx_needs_dequant) { + d_X = ctx->prealloc_x; + GGML_ASSERT(d_X->size >= x_sz); + } else { + d_X = d_Qx; + x_buf_offset = qx_buf_offset; + GGML_ASSERT(qx_sz == x_sz); + } + if (qy_needs_dequant) { + d_Y = ctx->prealloc_y; + GGML_ASSERT(d_Y->size >= y_sz); + } else if (quantize_y) { + d_Y = ctx->prealloc_y; + GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz, 144) * 144); + } else { + d_Y = d_Qy; + y_buf_offset = qy_buf_offset; + GGML_ASSERT(qy_sz == y_sz); + } + + if (x_non_contig || qx_needs_dequant) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + + if (x_non_contig) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); + } else if (qx_needs_dequant) { + const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_X, 0, x_sz } }, pc, { (uint32_t)(x_ne), 1, 1}); + ggml_vk_sync_buffers(ctx, subctx); + } + if (y_non_contig) { + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + if (quantize_y) { + if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne); + ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + + uint32_t stride_batch_x = ne00*ne01; + uint32_t stride_batch_y = ne10*ne11; + + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { + stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); + } + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) { + stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); + } + + // compute + ggml_vk_matmul( + ctx, subctx, pipeline, + { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, + ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * split_k }, + ne01, ne11, ne10, + ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d, + split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n + ); // NOLINT + + if (x_non_contig || qx_needs_dequant) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig || quantize_y) { + ctx->prealloc_y_need_sync = true; + } +} + +static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_t n, uint32_t k, ggml_type src0_type) { + if (device->mmvq_mode == 1) { + return true; + } else if (device->mmvq_mode == -1) { + return false; + } + + // q6_k only has 2-byte alignment which makes it somewhat problematic, + // using MMVQ is only a win on Intel. + bool mmvq_q6 = device->vendor_id == VK_VENDOR_ID_INTEL; + if (src0_type == GGML_TYPE_Q6_K && !mmvq_q6) { + return false; + } + + // MMVQ is generally good for batches + if (n > 1) { + return true; + } + + // Quantization overhead is not worth it for small k + switch (device->vendor_id) { + case VK_VENDOR_ID_NVIDIA: + if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { + return true; + } + + if (k <= 4096) { + return false; + } + + switch (src0_type) { + case GGML_TYPE_MXFP4: + case GGML_TYPE_Q8_0: + return device->architecture == vk_device_architecture::NVIDIA_PRE_TURING; + default: + return true; + } + case VK_VENDOR_ID_AMD: + if (k < 2048) { + return false; + } + + switch (src0_type) { + case GGML_TYPE_Q8_0: + return device->architecture == vk_device_architecture::AMD_GCN; + default: + return true; + } + case VK_VENDOR_ID_INTEL: + if (device->architecture == vk_device_architecture::INTEL_XE2) { + if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { + return true; + } + } + + if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { + // Intel Windows proprietary driver MMVQ performance for !Q2/Q3/Q6 is worse than fp16, + // see https://github.com/ggml-org/llama.cpp/issues/17628 and + // https://github.com/ggml-org/llama.cpp/pull/23056 + return false; + } + + if (k < 2048) { + return false; + } + + switch (src0_type) { + // From tests on A770 Linux, may need more tuning + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q5_1: + return false; + default: + return true; + } + default: + return true; + } + + GGML_UNUSED(m); +} + +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << ")),)"); + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + const uint64_t ne03 = src0->ne[3]; + + const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + const uint64_t ne13 = src1->ne[3]; + + const uint64_t ne20 = dst->ne[0]; + const uint64_t ne21 = dst->ne[1]; + // const uint64_t ne22 = dst->ne[2]; + // const uint64_t ne23 = dst->ne[3]; + + const uint64_t r2 = ne12 / ne02; + const uint64_t r3 = ne13 / ne03; + + // batch_n indicates that we need to compute a few vector results, and this assumes + // ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides. + GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1); + bool batch_n = ne11 > 1; + + const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + + const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; + bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne11, ne10, src0->type); + + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + + // Check for mmq first + vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, GGML_TYPE_Q8_1, ne11, ne20, ne00) : nullptr; + vk_pipeline to_q8_1 = nullptr; + + if (dmmv == nullptr) { + // Fall back to f16 dequant mul mat + dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11, ne20, ne00); + quantize_y = false; + } + + if (quantize_y) { + to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + } + + if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); + } + + const bool qx_needs_dequant = x_non_contig; + const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); + + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + GGML_ASSERT(dmmv != nullptr); + + const uint64_t x_ne = ggml_nelements(src0); + const uint64_t y_ne = ggml_nelements(src1); + + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : + (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); + + { + if ( + (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || + (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { + ctx->prealloc_size_y = y_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + + // Request descriptor sets + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); + } + if (quantize_y) { + ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); + } + } + + vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); + vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1); + vk_subbuffer d_X, d_Y; + + if (qx_needs_dequant) { + d_X = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; + } else { + d_X = d_Qx; + GGML_ASSERT(qx_sz == x_sz); + } + if (qy_needs_dequant || quantize_y) { + d_Y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; + } else { + d_Y = d_Qy; + } + + if (x_non_contig) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); + } + if (y_non_contig) { + GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + if (quantize_y) { + if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne); + ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + + // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride + uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01; + uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11); + uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21); + + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { + stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); + } + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { + stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); + } + + const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; + + uint32_t groups_x = ne01; + uint32_t groups_z = 1; + + if (ne01 > max_groups_x) { + groups_z = 64; + groups_x = CEIL_DIV(groups_x, groups_z); + } + + uint32_t fusion_flags = 0; + + vk_subbuffer d_F0 = d_D; + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; + + d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; + } + + vk_subbuffer d_F1 = d_D; + if (ctx->num_additional_fused_ops == 2) { + const ggml_tensor * add = cgraph->nodes[node_idx + 2]; + const ggml_tensor * bias = add->src[0] == cgraph->nodes[node_idx + 1] ? add->src[1] : add->src[0]; + + d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; + } + + ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1])); + + uint32_t base_work_group_y = 0; + while (base_work_group_y < ne12 * ne13) { + + uint32_t groups_y = std::min((uint32_t)(ne12 * ne13) - base_work_group_y, ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + const vk_mat_vec_push_constants pc = { + (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, + stride_batch_x, stride_batch_y, stride_batch_d, + fusion_flags, base_work_group_y, + (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, + }; + ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, + { + d_X, + d_Y, + d_D, + d_F0, + d_F1, + }, + pc, { groups_x, groups_y, groups_z }); + base_work_group_y += groups_y; + } + + if (x_non_contig) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig || quantize_y) { + ctx->prealloc_y_need_sync = true; + } +} + +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "))"); + GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); + GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT + GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + // const uint64_t ne03 = src0->ne[3]; + + //const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + // const uint64_t ne13 = src1->ne[3]; + + GGML_ASSERT(ne11 == 1); + + // With grouped query attention there are > 1 Q matrices per K, V matrix. + uint32_t gqa_ratio = (uint32_t)ne12 / (uint32_t)ne02; + if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) { + gqa_ratio = 1; + } + + vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1]; + + if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); + } + + { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + } + + vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true); + vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1, true); + + vk_subbuffer d_F0 = d_D; + + uint32_t fusion_flags = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; + + d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; + } + + vk_subbuffer d_F1 = d_D; + if (ctx->num_additional_fused_ops > 1) { + const ggml_tensor * bias = cgraph->nodes[node_idx + 2]->src[1]; + + d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; + } + + // compute + + vk_mat_vec_p021_push_constants pc = { + (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, + 0, 0, fusion_flags + }; + + init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); + + uint32_t workgroups_z = (uint32_t)ne12; + // When gqa_ratio > 1, each invocation does multiple rows and we can launch fewer workgroups + if (gqa_ratio > 1) { + workgroups_z /= gqa_ratio; + } + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + d_Qx, + d_Qy, + d_D, + d_F0, + d_F1, + }, pc, { 1, (uint32_t)ne01, workgroups_z }); +} + +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "))"); + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + GGML_ASSERT(!ggml_is_permuted(src0)); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + const uint64_t ne03 = src0->ne[3]; + + const uint64_t nb01 = src0->nb[1]; + const uint64_t nb02 = src0->nb[2]; + + const uint64_t nb12 = src1->nb[2]; + + // const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + // const uint64_t ne13 = src1->ne[3]; + + const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t)); + const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float)); + const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float)); + + GGML_ASSERT(ne11 == 1); + GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op + + const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t); + const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t); + const uint32_t channel_stride_y = nb12 / sizeof(float); + + vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_nc_f16_f32; + if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); + } + + { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + } + + vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true); + vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1, true); + vk_subbuffer d_F0 = d_D; + + uint32_t fusion_flags = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; + + d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; + } + + vk_subbuffer d_F1 = d_D; + if (ctx->num_additional_fused_ops > 1) { + const ggml_tensor * bias = cgraph->nodes[node_idx + 2]->src[1]; + + d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; + } + + // compute + vk_mat_vec_nc_push_constants pc = { + (uint32_t)ne00, (uint32_t)ne01, + row_stride_x, channel_stride_x, channel_stride_y, + (uint32_t)(ne12 / ne02), (uint32_t)ne12, + 0, 0, + nb03, nb13, nb23, fusion_flags + }; + + init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + d_Qx, + d_Qy, + d_D, + d_F0, + d_F1, + }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); +} + +void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); + + // Handle huge A matrix by splitting the M dimensions. This works well for convolution use cases + // where the M dimension is very large. + // Split_k doesn't work with M splitting. + // This only supports batchsize == 1. + const size_t nbytes = ggml_nbytes(src0); + const bool needs_split = dst->ne[2] == 1 && dst->ne[3] == 1 && nbytes > ctx->device->properties.limits.maxStorageBufferRange; + if (needs_split) { + // Choose the number of rows that can fit (and divide by two, to allow for any additional offsets) + const uint32_t M_split = ctx->device->properties.limits.maxStorageBufferRange / (2 * src0->nb[1]); + uint32_t m_offset = 0; + while (m_offset < dst->ne[0]) { + const uint32_t cur_M_size = std::min(M_split, (uint32_t)(dst->ne[0] - m_offset)); + ggml_tensor dst2 = *dst; + ggml_tensor src02 = *src0; + + dst2.view_src = dst->view_src ? dst->view_src : dst; + src02.view_src = src0->view_src ? src0->view_src : src0; + + dst2.view_offs += m_offset * dst->nb[0]; + src02.view_offs += m_offset * src0->nb[1]; + dst2.ne[0] = cur_M_size; + src02.ne[1] = cur_M_size; + + ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true); + + m_offset += cur_M_size; + } + } else if (ggml_vk_can_use_fwht(ctx, src1, dst)) { + ggml_vk_fwht(ctx, subctx, src1, dst); + } else if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 && + // detect 0213 permutation, and batch size of 1 + src0->nb[0] <= src0->nb[2] && + src0->nb[2] <= src0->nb[1] && + src0->nb[1] <= src0->nb[3] && + src1->nb[0] <= src1->nb[2] && + src1->nb[2] <= src1->nb[1] && + src1->nb[1] <= src1->nb[3] && + src0->ne[3] == 1 && + src1->ne[3] == 1 && + src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && + src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) { + ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx); + } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && + !ggml_is_permuted(src0) && !ggml_is_permuted(src1) && + src0->ne[3] <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] && + src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && + src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) { + ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx); + // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four) + // when ne12 and ne13 are one. + } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) { + ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx); + } else { + ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false); + } +} + +static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + // const uint64_t ne03 = src0->ne[3]; + + const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + const uint64_t ne13 = src1->ne[3]; + + const uint64_t nei0 = ids->ne[0]; + const uint64_t nei1 = ids->ne[1]; + + const uint32_t nbi0 = ids->nb[0]; + const uint32_t nbi1 = ids->nb[1]; + const uint32_t nbi2 = ids->nb[2]; + + const uint64_t ne20 = dst->ne[0]; + const uint64_t ne21 = dst->ne[1]; + // const uint64_t ne22 = dst->ne[2]; + // const uint64_t ne23 = dst->ne[3]; + + const uint64_t n_as = ne02; + + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; + + vk_buffer d_Qx = nullptr; + size_t qx_buf_offset = 0; + vk_buffer d_Qy = nullptr; + size_t qy_buf_offset = 0; + vk_buffer d_ids = nullptr; + size_t ids_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + bool ids_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset); + ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset); + src0_uma = d_Qx != nullptr; + src1_uma = d_Qy != nullptr; + ids_uma = d_ids != nullptr; + } + + // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf + const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || + !ggml_vk_dim01_contiguous(src0); + // If src0 is BF16, try to use a BF16 x BF16 multiply + ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; +#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + // B must already be, or be convertible to, the matmul B type used by this path. + const bool y_decode_vector_supported = ctx->device->coopmat2_decode_vector && + (f16_type != GGML_TYPE_BF16 || ctx->device->coopmat2_bf16_support) && + (src1->type == GGML_TYPE_F32 || src1->type == f16_type); + // If B is copied to prealloc_y, we can choose a 4-element-aligned row stride. + const bool y_decode_vector_uses_prealloc = !ggml_vk_dim01_contiguous(src1) || src1->type != f16_type; + // Direct B reads are safe only if row starts and the original buffer offset are 4-element aligned. + const bool y_decode_vector_aligned = + (ne10 % 4 == 0) && + (y_decode_vector_uses_prealloc || get_misalign_bytes(ctx, src1) % (4 * ggml_type_size(src1->type)) == 0); + // Stage B only when decode-vector is available and direct B reads would be misaligned. + const bool y_decode_vector_staging = y_decode_vector_supported && !y_decode_vector_aligned; +#else + const bool y_decode_vector_staging = false; +#endif + const bool y_non_contig = y_decode_vector_staging || + (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || + !ggml_vk_dim01_contiguous(src1); + + const uint32_t y_staged_row_stride = y_decode_vector_staging ? (uint32_t)ggml_vk_align_size(ne10, 4) : (uint32_t)ne10; + + const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; + + bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0; + + // Check for mmq first + vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr; + + if (mmp == nullptr) { + // Fall back to f16 dequant mul mat + mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); + quantize_y = false; + } + + const bool qx_needs_dequant = mmp == nullptr || x_non_contig; + const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); + + if (qx_needs_dequant) { + // Fall back to dequant + f16 mulmat + mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); + } + + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + + const ggml_type effective_src1_type = quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type); + + const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? f16_type : src0->type, effective_src1_type)); + const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && nei1 > 8; + + vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type, effective_src1_type); + + if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); + } + // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking + uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11; + const uint64_t x_ne = ggml_nelements(src0); + const uint64_t y_ne = (uint64_t)y_staged_row_stride * padded_n * ne12 * ne13; + const uint64_t d_ne = ggml_nelements(dst); + + const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); + const uint64_t qy_sz = ggml_type_size(src1->type) * ggml_nelements(src1) / ggml_blck_size(src1->type); + const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); + const uint64_t ids_sz = nbi2; + const uint64_t d_sz = sizeof(float) * d_ne; + + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + vk_pipeline to_q8_1 = nullptr; + + auto make_y_staged_dst = [&]() { + ggml_tensor y_staged_dst = *src1; + y_staged_dst.type = f16_type; + y_staged_dst.nb[0] = ggml_type_size(f16_type); + y_staged_dst.nb[1] = y_staged_dst.nb[0] * y_staged_row_stride; + y_staged_dst.nb[2] = y_staged_dst.nb[1] * padded_n; + y_staged_dst.nb[3] = y_staged_dst.nb[2] * y_staged_dst.ne[2]; + return y_staged_dst; + }; + + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); + } else { + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); + } + if (y_non_contig) { + ggml_tensor y_staged_dst; + const ggml_tensor * y_staged_dst_ptr = nullptr; + if (y_decode_vector_staging) { + y_staged_dst = make_y_staged_dst(); + y_staged_dst_ptr = &y_staged_dst; + } + + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, y_staged_dst_ptr, f16_type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + + if (quantize_y) { + to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + } + vk_pipeline count_experts = ctx->device->pipeline_count_experts; + + uint32_t expert_count_size = sizeof(uint32_t) * n_as; + + { + if ( + (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || + (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { + ctx->prealloc_size_y = y_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_size_split_k < expert_count_size) { + ctx->prealloc_size_split_k = expert_count_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); + } + if (quantize_y) { + ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); + } + ggml_pipeline_request_descriptor_sets(ctx, count_experts, 1); + } + + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + GGML_ASSERT(d_D != nullptr); + vk_buffer d_X; + uint64_t x_buf_offset = 0; + vk_buffer d_Y; + uint64_t y_buf_offset = 0; + if (!src0_uma) { + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; + GGML_ASSERT(d_Qx != nullptr); + } + if (!src1_uma) { + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; + GGML_ASSERT(d_Qy != nullptr); + } + if (!ids_uma) { + d_ids = ids_buf_ctx->dev_buffer; + ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; + GGML_ASSERT(d_ids != nullptr); + } + if (qx_needs_dequant) { + d_X = ctx->prealloc_x; + GGML_ASSERT(d_X->size >= x_sz); + } else { + d_X = d_Qx; + x_buf_offset = qx_buf_offset; + GGML_ASSERT(qx_sz == x_sz); + } + if (qy_needs_dequant) { + d_Y = ctx->prealloc_y; + GGML_ASSERT(d_Y->size >= y_sz); + } else if (quantize_y) { + d_Y = ctx->prealloc_y; + GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz, 144) * 144); + } else { + d_Y = d_Qy; + y_buf_offset = qy_buf_offset; + GGML_ASSERT(qy_sz == y_sz); + } + + if (x_non_contig || qx_needs_dequant) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + // Count how many times each expert is used + vk_subbuffer expert_count_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); + if (ctx->prealloc_split_k_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + { + const std::vector pc = { (uint32_t)nei0, + (uint32_t)nei1, + (uint32_t)(nbi0 / ggml_type_size(ids->type)), + (uint32_t)(nbi1 / ggml_type_size(ids->type)), + (uint32_t)(get_misalign_bytes(ctx, ids) / ggml_type_size(ids->type)) }; + ggml_vk_dispatch_pipeline(ctx, subctx, count_experts, + { vk_subbuffer{ d_ids, ids_buf_offset, ids_sz }, expert_count_buf }, pc, { (uint32_t)n_as, 1, 1}); + } + + if (x_non_contig) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); + } else if (qx_needs_dequant) { + const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_X, 0, x_sz } }, pc, { (uint32_t)x_ne, 1, 1}); + } + if (y_non_contig) { + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging != y_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + if (y_decode_vector_staging) { + const ggml_tensor y_staged_dst = make_y_staged_dst(); + const uint32_t y_staged_dst_type_size = ggml_type_size(y_staged_dst.type); + ggml_vk_cpy_to_strided( + ctx, subctx, to_fp16_vk_1, src1, + ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), + (uint32_t)(y_staged_dst.nb[0] / y_staged_dst_type_size), + (uint32_t)(y_staged_dst.nb[1] / y_staged_dst_type_size), + (uint32_t)(y_staged_dst.nb[2] / y_staged_dst_type_size), + (uint32_t)(y_staged_dst.nb[3] / y_staged_dst_type_size)); + } else { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); + } + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = y_decode_vector_staging; + } + } + if (quantize_y) { + if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne); + ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + ggml_vk_sync_buffers(ctx, subctx); + + uint32_t stride_batch_x = ne00*ne01; + uint32_t stride_b_y = y_decode_vector_staging ? y_staged_row_stride : ne10; + uint32_t stride_batch_y = y_decode_vector_staging ? y_staged_row_stride * padded_n : ne10*ne11; + + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { + stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); + } + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) { + stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); + } + + // compute + ggml_vk_matmul_id( + ctx, subctx, pipeline, + { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, + { d_D, d_buf_offset, d_sz }, { d_ids, ids_buf_offset, ids_sz }, expert_count_buf, + ne01, ne21, ne10, ne10, stride_b_y, ne01, + stride_batch_x, stride_batch_y, ne20*ne21, + n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n + ); // NOLINT + + if (x_non_contig || qx_needs_dequant) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig || quantize_y) { + ctx->prealloc_y_need_sync = true; + } + ctx->prealloc_split_k_need_sync = true; +} + +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + ggml_tensor * ids = dst->src[2]; + VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "))"); + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + // const uint64_t ne02 = src0->ne[2]; + // const uint64_t ne03 = src0->ne[3]; + + const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + // const uint64_t ne13 = src1->ne[3]; + + const uint64_t nei0 = ids->ne[0]; + const uint64_t nei1 = ids->ne[1]; + const uint32_t nbi1 = (uint32_t)(ids->nb[1] / sizeof(int)); + + const uint64_t ne20 = dst->ne[0]; + const uint64_t ne21 = dst->ne[1]; + // const uint64_t ne22 = dst->ne[2]; + // const uint64_t ne23 = dst->ne[3]; + + const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + + const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; + bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne12, ne10, src0->type); + + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + + // Check for mmq first + vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, GGML_TYPE_Q8_1, ne20, ne00) : nullptr; + vk_pipeline to_q8_1 = nullptr; + + if (dmmv == nullptr) { + // Fall back to f16 dequant mul mat + dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type, ne20, ne00); + quantize_y = false; + } + + if (quantize_y) { + to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + } + + const bool qx_needs_dequant = x_non_contig; + const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); + + if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); + } + + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + GGML_ASSERT(dmmv != nullptr); + + const uint64_t x_ne = ggml_nelements(src0); + const uint64_t y_ne = ggml_nelements(src1); + + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : + (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); + + { + if ( + (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || + (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { + ctx->prealloc_size_y = y_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + + // Request descriptor sets + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); + } + if (quantize_y) { + ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); + } + ggml_pipeline_request_descriptor_sets(ctx, dmmv, nei1); + } + + vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); + vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1); + vk_subbuffer d_ids = ggml_vk_tensor_subbuffer(ctx, ids); + vk_subbuffer d_F0 = d_D; + vk_subbuffer d_X, d_Y; + + if (qx_needs_dequant) { + d_X = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; + } else { + d_X = d_Qx; + } + if (qy_needs_dequant || quantize_y) { + d_Y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; + } else { + d_Y = d_Qy; + } + + if (x_non_contig) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + + if (x_non_contig) { + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); + } + if (y_non_contig) { + GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + if (quantize_y) { + if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || + ctx->prealloc_y_last_tensor_used != src1 || + ctx->prealloc_y_last_decode_vector_staging) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne); + ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + ctx->prealloc_y_last_decode_vector_staging = false; + } + } + + uint32_t stride_batch_y = ne10*ne11; + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { + stride_batch_y = src1->nb[2] / ggml_type_size(src1->type); + } + + const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; + + uint32_t groups_x = ne01; + uint32_t groups_z = 1; + + if (ne01 > max_groups_x) { + groups_z = 64; + groups_x = CEIL_DIV(groups_x, groups_z); + } + + uint32_t fusion_flags = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1]; + + d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); + + if (cgraph->nodes[node_idx + 1]->op == GGML_OP_MUL) { + fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE0; + } else { + GGML_ASSERT(cgraph->nodes[node_idx + 1]->op == GGML_OP_ADD_ID); + fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; + } + } + + vk_subbuffer d_F1 = d_D; + if (ctx->num_additional_fused_ops > 1) { + const ggml_tensor * scale = cgraph->nodes[node_idx + 2]->src[1]; + + d_F1 = ggml_vk_tensor_subbuffer(ctx, scale); + fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE1; + } + + // Loop over the batch dimension + for (uint32_t expert_i1 = 0; expert_i1 < nei1; ++expert_i1) { + const vk_mat_vec_id_push_constants pc = { + (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, + (uint32_t)(ne00 * ne01), stride_batch_y, (uint32_t)(ne20 * ne21), + fusion_flags, + (uint32_t)nei0, (uint32_t)ne11, expert_i1, nbi1 + }; + ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, + { + d_X, + d_Y, + d_D, + d_F0, + d_F1, + d_ids, + }, + pc, { groups_x, (uint32_t)nei0, groups_z }); + } + + if (x_non_contig) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig || quantize_y) { + ctx->prealloc_y_need_sync = true; + } +} + +bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src2 = dst->src[2]; + return (src2->ne[1] <= 8) && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)); +} + +void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + ggml_tensor * src2 = dst->src[2]; + VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); + if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { + ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx); + } else { + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst); + } +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-operators.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-operators.cpp new file mode 100644 index 0000000000..3707613b47 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-operators.cpp @@ -0,0 +1,2893 @@ +#include "ggml-vulkan-common.h" + +void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) { + VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; + std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); + + const uint32_t ne = ggml_nelements(tensor); + std::array elements; + + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = 1; + pc.nb11 = (uint32_t)tensor->ne[0]; + pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]); + pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]); + init_pushconst_fastdiv(pc); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); +} + +static int ggml_vk_fwht_pipeline_idx(int64_t n) { + switch (n) { + case 64: return 0; + case 128: return 1; + case 256: return 2; + case 512: return 3; + default: return -1; + } +} + +bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst) { + if (ctx->num_additional_fused_ops != 0) { + return false; + } + + if (ggml_get_op_params_i32(dst, 1) != GGML_HINT_SRC0_IS_HADAMARD) { + return false; + } + + const int idx = ggml_vk_fwht_pipeline_idx(src1->ne[0]); + if (idx < 0 || ctx->device->pipeline_fwht_f32[idx] == nullptr) { + return false; + } + + if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + + if (!ggml_is_contiguous(src1)) { + return false; + } + GGML_ASSERT(ggml_is_contiguous(dst)); + + return true; +} + +void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst) { + const int idx = ggml_vk_fwht_pipeline_idx(src->ne[0]); + vk_pipeline pipeline = ctx->device->pipeline_fwht_f32[idx]; + + const uint32_t rows_per_workgroup = 4; + const uint32_t n_rows = (uint32_t)ggml_nrows(src); + const uint32_t max_workgroups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; + + const uint32_t total_workgroups = CEIL_DIV(n_rows, rows_per_workgroup); + const uint32_t workgroups_x = std::min(total_workgroups, max_workgroups_x); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + const vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src, true); + const vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true); + + vk_op_fwht_push_constants pc = { + n_rows, + 0, + 0, + 1.0f / std::sqrt((float)src->ne[0]), + }; + init_pushconst_tensor_offsets(ctx, pc, src, nullptr, nullptr, nullptr, dst); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, { workgroups_x, 1, 1 }); +} + +static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, uint32_t K, uint32_t NPQ) { + auto n_tiles = [&](vk_conv_shapes s) { + return CEIL_DIV(K, vk_conv_block_sizes[s].K) + * CEIL_DIV(NPQ, vk_conv_block_sizes[s].NPQ); + }; + + // We can't query number of shader cores on Intel, use 32 as a placeholder + // so small convolutions will still choose a smaller tile. + const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32; + + // 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile. + bool allow_128x128 = true; +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) { + allow_128x128 = false; + } +#endif + + if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) { + return CONV_SHAPE_128x128; + } else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) { + return CONV_SHAPE_32x256; + } else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) { + return CONV_SHAPE_64x128; + } else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) { + // cm1 fallback for large K when 128x128 isn't available + return CONV_SHAPE_64x128; + } else { + return CONV_SHAPE_64x32; + } +} + +static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) { + switch (op) { + case GGML_OP_GET_ROWS: + GGML_ASSERT(src1->type == GGML_TYPE_I32); + if (src0->type == GGML_TYPE_I32) { + // i32 src only supports i32 result + GGML_ASSERT(dst->type == GGML_TYPE_I32); + return ctx->device->pipeline_get_rows[src0->type]; + } + if (dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_get_rows[src0->type]; + } + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_get_rows_f32[src0->type]; + } + return nullptr; + case GGML_OP_ACC: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_acc_f32; + } + return nullptr; + case GGML_OP_SET: + if (src0->type == src1->type && src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32)) { + return ctx->device->pipeline_set_f32; + } + return nullptr; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || + (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) || + (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16)) { + return nullptr; + } + switch (op) { + case GGML_OP_ADD: + { + if (ctx->num_additional_fused_ops > 0) { + if (ctx->do_add_rms_partials) { + return ctx->device->pipeline_multi_add_rms[ctx->num_additional_fused_ops]; + } else { + return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; + } + } + if (ctx->do_add_rms_partials) { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_rms_norepeat : ctx->device->pipeline_add_rms; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; + } else { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; + } + } + case GGML_OP_SUB: + { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_sub_norepeat : ctx->device->pipeline_sub; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; + } + case GGML_OP_MUL: + { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_norepeat : ctx->device->pipeline_mul; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; + } + case GGML_OP_DIV: + { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_norepeat : ctx->device->pipeline_div; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; + } + default: + break; + } + return nullptr; + case GGML_OP_ADD_ID: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && src2->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_add_id_f32; + } + return nullptr; + case GGML_OP_CONCAT: { + if (src0->type != src1->type || src0->type != dst->type) { + return nullptr; + } + if (ggml_blck_size(src0->type) != 1) { + return nullptr; + } + const size_t type_size = ggml_type_size(src0->type); + switch (type_size) { + case 1: + return ctx->device->pipeline_concat_i8; + case 2: + return ctx->device->pipeline_concat_i16; + case 4: + return ctx->device->pipeline_concat_i32; + case 8: + return ctx->device->pipeline_concat_i64; + default: + return nullptr; + } + } + case GGML_OP_UPSCALE: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS)); + switch (mode) { + case GGML_SCALE_MODE_NEAREST: + return ctx->device->pipeline_upscale_nearest_f32; + case GGML_SCALE_MODE_BILINEAR: + return ctx->device->pipeline_upscale_bilinear_f32; + case GGML_SCALE_MODE_BICUBIC: + return ctx->device->pipeline_upscale_bicubic_f32; + case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS: + return ctx->device->pipeline_upscale_bilinear_antialias_f32; + default: + return nullptr; + } + } + return nullptr; + case GGML_OP_SCALE: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_scale_f32; + } + return nullptr; + case GGML_OP_SQR: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_sqr_f32; + } + return nullptr; + case GGML_OP_SQRT: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_sqrt_f32; + } + return nullptr; + case GGML_OP_SIN: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_sin_f32; + } + return nullptr; + case GGML_OP_COS: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_cos_f32; + } + return nullptr; + case GGML_OP_LOG: + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_log[dst->type == GGML_TYPE_F16]; + } + return nullptr; + case GGML_OP_TRI: + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; + } + return nullptr; + case GGML_OP_DIAG: + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16]; + } + return nullptr; + case GGML_OP_CLAMP: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_clamp_f32; + } + return nullptr; + case GGML_OP_PAD: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_pad_f32; + } + return nullptr; + case GGML_OP_ROLL: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_roll_f32; + } + return nullptr; + case GGML_OP_REPEAT: + if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { + return ctx->device->pipeline_repeat_i32; + } + if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) { + return ctx->device->pipeline_repeat_i16; + } + return nullptr; + case GGML_OP_REPEAT_BACK: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_repeat_back_f32; + } + return nullptr; + case GGML_OP_CPY: + case GGML_OP_CONT: + case GGML_OP_DUP: + return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); + case GGML_OP_SET_ROWS: + if (src1->type == GGML_TYPE_I64) { + return ctx->device->pipeline_set_rows_i64[dst->type]; + } else { + return ctx->device->pipeline_set_rows_i32[dst->type]; + } + case GGML_OP_SILU_BACK: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_silu_back_f32; + } + return nullptr; + case GGML_OP_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_norm_f32; + } + return nullptr; + case GGML_OP_GROUP_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_group_norm_f32; + } + return nullptr; + case GGML_OP_RMS_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (ctx->do_add_rms_partials) { + return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_partials_f32 : ctx->device->pipeline_rms_norm_partials_f32; + } else { + return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; + } + } + return nullptr; + case GGML_OP_RMS_NORM_BACK: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rms_norm_back_f32; + } + return nullptr; + case GGML_OP_L2_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_l2_norm_f32; + } + return nullptr; + case GGML_OP_UNARY: + if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || + (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || + (src0->type != dst->type)) { + return nullptr; + } + + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_EXP: + return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_EXPM1: + return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_ELU: + return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_SILU: + return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_GELU: + return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_GELU_ERF: + return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_GELU_QUICK: + return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_RELU: + return ctx->device->pipeline_relu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_XIELU: + return ctx->device->pipeline_xielu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_NEG: + return ctx->device->pipeline_neg[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_TANH: + return ctx->device->pipeline_tanh[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_SIGMOID: + return ctx->device->pipeline_sigmoid[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_HARDSIGMOID: + return ctx->device->pipeline_hardsigmoid[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_HARDSWISH: + return ctx->device->pipeline_hardswish[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_ABS: + return ctx->device->pipeline_abs[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_SOFTPLUS: + return ctx->device->pipeline_softplus[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_STEP: + return ctx->device->pipeline_step[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_ROUND: + return ctx->device->pipeline_round[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_CEIL: + return ctx->device->pipeline_ceil[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_FLOOR: + return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_TRUNC: + return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_SGN: + return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16]; + default: + break; + } + return nullptr; + case GGML_OP_GLU: + if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || + (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || + (src0->type != dst->type)) { + return nullptr; + } + + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_GEGLU: + return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_REGLU: + return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_SWIGLU: + return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_SWIGLU_OAI: + return ctx->device->pipeline_swiglu_oai[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_GEGLU_ERF: + return ctx->device->pipeline_geglu_erf[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_GEGLU_QUICK: + return ctx->device->pipeline_geglu_quick[dst->type == GGML_TYPE_F16]; + default: + break; + } + return nullptr; + case GGML_OP_DIAG_MASK_INF: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_diag_mask_inf_f32; + } + return nullptr; + case GGML_OP_SOFT_MAX: + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); + GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); + + if (ctx->num_additional_fused_ops) { + uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); + GGML_ASSERT(idx < num_topk_moe_pipelines); + // use n_experts from push constant if it's not equal to the power of two spec constant + bool use_push = dst->ne[0] != (1u << idx); + return ctx->device->pipeline_topk_moe[idx][use_push]; + } + + if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { + return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32; + } + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16; + } + return nullptr; + case GGML_OP_SOFT_MAX_BACK: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_soft_max_back_f32; + } + return nullptr; + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + const ggml_tensor *rope = ctx->num_additional_fused_ops == 2 ? dst->src[0]->src[0] : dst; + const int mode = ((const int32_t *) rope->op_params)[2]; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_neox) { + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rope_neox_f32; + } + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_neox_f32_f16; + } + if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_neox_f16; + } + } else if (is_mrope && !is_vision) { + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rope_multi_f32; + } + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_multi_f32_f16; + } + if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_multi_f16; + } + } else if (is_vision) { + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rope_vision_f32; + } + if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_vision_f16; + } + } else { + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rope_norm_f32; + } + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_norm_f32_f16; + } + if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_rope_norm_f16; + } + } + return nullptr; + } + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_sum_rows_f32; + } + return nullptr; + case GGML_OP_CUMSUM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (src0->ne[0] <= 512) { + return ctx->device->pipeline_cumsum_small_f32; + } else { + return ctx->device->pipeline_cumsum_f32; + } + } + return nullptr; + case GGML_OP_SOLVE_TRI: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + + vk_solve_tri_pipeline_state solve_tri_pipeline_state(src0->ne[0], src1->ne[0]); + + vk_pipeline pipeline = nullptr; + + { + std::lock_guard guard(ctx->device->compile_mutex); + auto it = ctx->device->pipeline_solve_tri_f32.find(solve_tri_pipeline_state); + if (it != ctx->device->pipeline_solve_tri_f32.end()) { + pipeline = it->second; + } else { + ctx->device->pipeline_solve_tri_f32[solve_tri_pipeline_state] = pipeline = std::make_shared(); + } + } + + return pipeline; + } + return nullptr; + case GGML_OP_ARGMAX: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) { + return ctx->device->pipeline_argmax_f32; + } + return nullptr; + case GGML_OP_COUNT_EQUAL: + if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I64) { + return ctx->device->pipeline_count_equal_i32; + } + return nullptr; + case GGML_OP_IM2COL: + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_im2col_f32; + } + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_im2col_f32_f16; + } + return nullptr; + case GGML_OP_IM2COL_3D: + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_im2col_3d_f32; + } + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_im2col_3d_f32_f16; + } + return nullptr; + case GGML_OP_TIMESTEP_EMBEDDING: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_timestep_embedding_f32; + } + return nullptr; + case GGML_OP_CONV_TRANSPOSE_1D: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_conv_transpose_1d_f32; + } + return nullptr; + case GGML_OP_COL2IM_1D: + switch (src0->type) { + case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32; + case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16; + case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16; + default: return nullptr; + } + case GGML_OP_POOL_2D: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_pool2d_f32; + } + return nullptr; + case GGML_OP_RWKV_WKV6: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rwkv_wkv6_f32; + } + return nullptr; + case GGML_OP_RWKV_WKV7: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rwkv_wkv7_f32; + } + return nullptr; + case GGML_OP_GATED_DELTA_NET: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + const uint32_t S_v = dst->src[2]->ne[0]; + const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0; + uint32_t si; + switch (S_v) { + case 16: si = 0; break; + case 32: si = 1; break; + case 64: si = 2; break; + case 128: si = 3; break; + default: return nullptr; + } + return ctx->device->pipeline_gated_delta_net[si][kda]; + } + return nullptr; + case GGML_OP_SSM_SCAN: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + const uint32_t d_state = src0->ne[0]; + if (d_state == 128) { + return ctx->device->pipeline_ssm_scan_f32_d128; + } else if (d_state == 256) { + return ctx->device->pipeline_ssm_scan_f32_d256; + } + } + return nullptr; + case GGML_OP_SSM_CONV: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + switch (ctx->num_additional_fused_ops) { + case 0: return ctx->device->pipeline_ssm_conv_f32; + case 1: return ctx->device->pipeline_ssm_conv_silu_f32; + case 2: return ctx->device->pipeline_ssm_conv_bias_silu_f32; + default: return nullptr; + } + } + return nullptr; + case GGML_OP_OPT_STEP_ADAMW: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_opt_step_adamw_f32; + } + return nullptr; + case GGML_OP_OPT_STEP_SGD: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_opt_step_sgd_f32; + } + return nullptr; + case GGML_OP_LEAKY_RELU: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_leaky_relu_f32; + } + return nullptr; + case GGML_OP_CONV_2D: + case GGML_OP_CONV_TRANSPOSE_2D: + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + uint32_t K = dst->ne[2]; // Cout + uint32_t NPQ = dst->ne[3] * dst->ne[1] * dst->ne[0]; // N * OH * OW + vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, K, NPQ); + + bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D; + uint32_t KW = (uint32_t)src0->ne[0]; + uint32_t KH = (uint32_t)src0->ne[1]; + uint32_t s0 = (uint32_t)(ggml_get_op_params_i32(dst, 0)); + uint32_t s1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 1) : s0; + uint32_t p0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 2) : 0; + uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0; + uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1; + uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1; + + // tile-aligned shapes let the shader skip bounds checks + const uint32_t Cin = (uint32_t)src1->ne[2]; + const uint32_t CRS = Cin * KW * KH; + const uint32_t BS_K = vk_conv_block_sizes[shape].K; + const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS; + const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ; + const uint32_t aligned = ((K % BS_K == 0) && + (CRS % BS_CRS == 0) && + (NPQ % BS_NPQ == 0)) ? 1u : 0u; + + vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned); + + std::map *pipelines = nullptr; + if (op == GGML_OP_CONV_2D) { + if (src0->type == GGML_TYPE_F32) { + pipelines = &ctx->device->pipeline_conv2d_f32[shape]; + } else if (src0->type == GGML_TYPE_F16) { + pipelines = &ctx->device->pipeline_conv2d_f16_f32[shape]; + } + } else if (op == GGML_OP_CONV_TRANSPOSE_2D) { + if (src0->type == GGML_TYPE_F32) { + pipelines = &ctx->device->pipeline_conv_transpose_2d_f32[shape]; + } else if (src0->type == GGML_TYPE_F16) { + pipelines = &ctx->device->pipeline_conv_transpose_2d_f16_f32[shape]; + } + } + + vk_pipeline pipeline = nullptr; + + { + std::lock_guard guard(ctx->device->compile_mutex); + auto it = pipelines->find(conv2d_pipeline_state); + if (it != pipelines->end()) { + pipeline = it->second; + } else { + (*pipelines)[conv2d_pipeline_state] = pipeline = std::make_shared(); + } + } + + return pipeline; + } + return nullptr; + case GGML_OP_CONV_2D_DW: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (ggml_is_contiguous(src1)) { + return ctx->device->pipeline_conv2d_dw_whcn_f32; + } else if (ggml_is_contiguous_channels(src1)) { + return ctx->device->pipeline_conv2d_dw_cwhn_f32; + } + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + if (ggml_is_contiguous(src1)) { + return ctx->device->pipeline_conv2d_dw_whcn_f16_f32; + } else if (ggml_is_contiguous_channels(src1)) { + return ctx->device->pipeline_conv2d_dw_cwhn_f16_f32; + } + } + return nullptr; + case GGML_OP_ADD1: + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_add1_f16_f16; + } + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_add1_f16_f32; + } + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_add1_f32_f32; + } + return nullptr; + case GGML_OP_ARANGE: + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_arange_f32; + } + return nullptr; + case GGML_OP_FILL: + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_fill_f32; + } + if (dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_fill_f16; + } + return nullptr; + default: + return nullptr; + } + + GGML_UNUSED(src2); +} + +template +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) { + VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + if (src1 != nullptr) { + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + } + if (src2 != nullptr) { + std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; + } + if (src3 != nullptr) { + std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3]; + } + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << ggml_op_name(op) << ")"); + GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT + GGML_ASSERT(dst->buffer != nullptr); + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + const uint64_t ne03 = src0->ne[3]; + + const bool use_src1 = src1 != nullptr; + const uint64_t ne10 = use_src1 ? src1->ne[0] : 0; + const uint64_t ne11 = use_src1 ? src1->ne[1] : 0; + const uint64_t ne12 = use_src1 ? src1->ne[2] : 0; + const uint64_t ne13 = use_src1 ? src1->ne[3] : 0; + + const bool use_src2 = src2 != nullptr; + const bool use_src3 = src3 != nullptr; + + init_pushconst_fastdiv(pc); + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); + + if (pipeline == nullptr) { + std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type); + if (src1 != nullptr) { + std::cerr << " and " << ggml_type_name(src1->type); + } + std::cerr << " to " << ggml_type_name(dst->type) << std::endl; + GGML_ABORT("fatal error"); + } + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, true); + vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, true) : vk_subbuffer{}; + vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, true) : vk_subbuffer{}; + vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, true) : vk_subbuffer{}; + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true); + + // Compute misalignment offset for descriptors and store it in in push constants. + init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst); + + std::array elements; + + switch (op) { + case GGML_OP_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_L2_NORM: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_SUM_ROWS: + case GGML_OP_CUMSUM: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + { + const uint32_t nr = ggml_nrows(src0); + if (nr > 262144) { + elements = { 512, 512, CEIL_DIV(nr, 262144) }; + } else if (nr > 512) { + elements = { 512, CEIL_DIV(nr, 512), 1 }; + } else { + elements = { nr, 1, 1 }; + } + } break; + case GGML_OP_SOLVE_TRI: + { + uint32_t nr = (uint32_t)(ne02 * ne03); + if (nr > 262144) { + elements = { 512, 512, CEIL_DIV(nr, 262144) }; + } else if (nr > 512) { + elements = { 512, CEIL_DIV(nr, 512), 1 }; + } else { + elements = { nr, 1, 1 }; + } + } + break; + case GGML_OP_RMS_NORM: + if (ctx->do_add_rms_partials) { + // Run one element per thread, 128 threads per workgroup + elements = { (uint32_t)CEIL_DIV(ne00, 128), 1, 1 }; + } else { + elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 }; + } + break; + + case GGML_OP_SUM: + // We use GGML_OP_SUM_ROWS with 1 row. + elements = { 1, 1, 1 }; + break; + case GGML_OP_GROUP_NORM: + { + const uint32_t num_groups = dst->op_params[0]; + elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; + } break; + case GGML_OP_DIAG_MASK_INF: + elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; + break; + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + uint32_t nrows = (uint32_t)ggml_nrows(src0); + uint32_t z = 1; + if (nrows > ctx->device->properties.limits.maxComputeWorkGroupCount[0]) { + z = CEIL_DIV(nrows, 32768); + nrows = 32768; + } + elements = { nrows, (uint32_t)ne00, z }; + + } break; + case GGML_OP_GET_ROWS: + elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; + elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + break; + case GGML_OP_ARGSORT: + GGML_ASSERT(0); + break; + case GGML_OP_IM2COL: + { + const bool is_2D = dst->op_params[6] == 1; + + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t batch = src1->ne[is_2D ? 3 : 2]; + + const uint32_t CHW = IC * KH * KW; + // Cap X workgroups to limit concurrent IC channel reads. + // The shader loops over X to cover the full CHW dimension. + // AMD prefers a lower limit + const uint32_t min_cap = ctx->device->vendor_id == VK_VENDOR_ID_AMD ? 512u : 4096u; + const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW)); + elements = { x_elements, OW, OH * batch }; + elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + } break; + case GGML_OP_IM2COL_3D: + { + const uint32_t IC = ((const uint32_t *)(dst->op_params))[9]; + + const uint32_t N = ne13 / IC; + + const uint32_t KD = ne02; + const uint32_t KH = ne01; + const uint32_t KW = ne00; + + const uint32_t OD = dst->ne[3] / N; + const uint32_t OH = dst->ne[2]; + const uint32_t OW = dst->ne[1]; + + const uint32_t IC_KD_KH_KW = IC*KD*KH*KW; + const uint32_t N_OD_OH = N*OD*OH; + + elements = { IC_KD_KH_KW, OW, N_OD_OH }; + elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + const uint32_t dim = dst->op_params[0]; + uint32_t half_ceil = (dim + 1) / 2; + elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1} + } break; + case GGML_OP_COL2IM_1D: + { + elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 }; + } break; + case GGML_OP_POOL_2D: + { + const uint32_t N = dst->ne[3]; + const uint32_t OC = dst->ne[2]; + const uint32_t OH = dst->ne[1]; + const uint32_t OW = dst->ne[0]; + elements = { N * OC * OH * OW, 1, 1}; + } break; + case GGML_OP_CONV_2D: + case GGML_OP_CONV_TRANSPOSE_2D: + if constexpr (std::is_same_v) { + const uint32_t NPQ = pc.N * pc.OH * pc.OW; + const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.Cout, NPQ); + const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ); + + elements = { pc.Cout, NPQ_blocks, 1 }; + if (elements[1] > 512) { + elements[2] = CEIL_DIV(elements[1], 512); + elements[1] = 512; + } + } else { + GGML_ABORT("invalid push constant type for CONV_2D"); + } + break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_ADD1: + case GGML_OP_ARANGE: + case GGML_OP_FILL: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_LOG: + case GGML_OP_TRI: + case GGML_OP_DIAG: + case GGML_OP_CLAMP: + case GGML_OP_PAD: + case GGML_OP_ROLL: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_CPY: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: + case GGML_OP_UNARY: + case GGML_OP_GLU: + case GGML_OP_CONV_2D_DW: + { + uint32_t ne = ggml_nelements(dst); + if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { + // Convert from number of logical elements to 2- or 4-byte units. + ne /= ggml_blck_size(src0->type); + if ((ggml_type_size(src0->type) % 4) == 0) { + ne *= ggml_type_size(src0->type) / 4; + } else { + ne *= ggml_type_size(src0->type) / 2; + } + } + // copy_to_quant has block size of 32, and each thread does QUANT_K elements. + // Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements. + // So divide by block size here before splitting into 512x512 groups. + if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { + ne = CEIL_DIV(ne, ggml_blck_size(dst->type)); + } + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + + if (pipeline == ctx->device->pipeline_cpy_transpose_32 || + pipeline == ctx->device->pipeline_cpy_transpose_16) { + // 32x32 tiles + elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32); + elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32); + elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]); + elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]); + elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); + } + } break; + case GGML_OP_ADD_ID: + { + elements = { (uint32_t)ne01, (uint32_t)ne02, 1 }; + } break; + case GGML_OP_SET_ROWS: + { + uint32_t ne = ggml_nelements(src0); + if (ggml_is_quantized(dst->type)) { + // quants run 32 threads each doing QUANT_K elements + ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type)); + } else { + // scalar types do one element per thread, running 512 threads + ne = CEIL_DIV(ne, 512); + } + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + } + break; + case GGML_OP_SSM_CONV: + { + const uint32_t nr = src0->ne[1]; + const uint32_t n_t = dst->ne[1]; + const uint32_t n_s = dst->ne[2]; + elements = { nr, n_t, n_s }; + } + break; + default: + elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; + break; + } + + if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) { + vk_subbuffer a_buf = src0_buf; + if (ctx->do_add_rms_partials) { + a_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset); + } + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { src0_buf, src1_buf, dst_buf, a_buf }, pc, elements); + } else if (op == GGML_OP_GLU) { + // Empty src1 is possible in glu, but the shader needs a buffer + vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc, elements); + } else if (op == GGML_OP_SOFT_MAX) { + // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer + vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; + vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, subbuf2, dst_buf }, pc, elements); + } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { + // Empty src2 and src3 is possible in rope, but the shader needs a buffer + vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; + vk_subbuffer subbuf3 = use_src3 ? src3_buf : src0_buf; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, subbuf2, dst_buf, subbuf3 }, pc, elements); + } else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) { + if (ctx->device->shader_int64 && ctx->device->buffer_device_address) { + // buffer device address path doesn't use dst buffer + dst_buf.size = 1; + } + // im2col uses only src1 and dst buffers + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src1_buf, dst_buf }, pc, elements); + } else if (op == GGML_OP_COUNT_EQUAL) { + // count_equal assumes that destination buffer is initialized with zeroes + ggml_vk_buffer_memset_async(subctx, dst_buf.buffer, dst_buf.offset, 0, dst_buf.size); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); + } else if (op == GGML_OP_OPT_STEP_SGD) { + // OPT_STEP_SGD works on src0, it does not need dst + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf }, pc, elements); + } else if (use_src3) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, src3_buf, dst_buf }, pc, elements); + } else if (use_src2) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, dst_buf }, pc, elements); + } else if (use_src1) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); + } else { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, dst_buf }, pc, elements); + } +} + +void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + int nb1 = dst->op_params[0] / src0_type_size; // 4 bytes of float32 + int nb2 = dst->op_params[1] / src0_type_size; // 4 bytes of float32 + int nb3 = dst->op_params[2] / src0_type_size; // 4 bytes of float32 + int offset = dst->op_params[3] / src0_type_size; // offset in bytes + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, + 0, + 0.0f, 0.0f, offset, + }); +} + +void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { + const ggml_tensor *first_node = cgraph->nodes[node_idx]; + const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; + + // Make a list of all the tensors used by the op. + // Last element of the list is the dest tensor. + const ggml_tensor *tensors[MAX_PARAMETER_COUNT]; + uint32_t num_srcs = ctx->num_additional_fused_ops + 2; + uint32_t num_tensors = num_srcs + 1; + GGML_ASSERT(num_tensors + ctx->do_add_rms_partials <= MAX_PARAMETER_COUNT); + + tensors[0] = first_node->src[0]; + tensors[1] = first_node->src[1]; + for (int32_t i = 0; i < ctx->num_additional_fused_ops; ++i) { + // check whether the previous result is src[0] or src[1] + if (cgraph->nodes[node_idx + i] == cgraph->nodes[node_idx + i + 1]->src[0]) { + tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[1]; + } else { + tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[0]; + } + } + tensors[num_srcs] = dst; + + vk_op_multi_add_push_constants pc; + pc.ne20 = (uint32_t)dst->ne[0]; + pc.ne21 = (uint32_t)dst->ne[1]; + pc.ne22 = (uint32_t)dst->ne[2]; + pc.ne23 = (uint32_t)dst->ne[3]; + + for (uint32_t i = 0; i < num_tensors; ++i) { + const ggml_tensor *t = tensors[i]; + pc.nb[i][0] = (uint32_t)t->nb[0] / sizeof(float); + pc.nb[i][1] = (uint32_t)t->nb[1] / sizeof(float); + pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float); + pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float); + } + pc.rms_partials = ctx->do_add_rms_partials; + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, tensors[0], tensors[1], nullptr, dst, dst->op); + + if (pipeline == nullptr) { + std::cerr << "ggml_vulkan: Error: Missing multi_add"; + GGML_ABORT("fatal error"); + } + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT]; + vk_buffer buf[MAX_PARAMETER_COUNT]; + size_t offset[MAX_PARAMETER_COUNT]; + bool uma[MAX_PARAMETER_COUNT]; + + for (uint32_t i = 0; i < num_tensors; ++i) { + buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; + buf[i] = nullptr; + offset[i] = 0; + uma[i] = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); + uma[i] = buf[i] != nullptr; + } + if (!uma[i]) { + buf[i] = buf_ctx[i]->dev_buffer; + offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; + } + GGML_ASSERT(buf[i] != nullptr); + } + // If any remaining descriptors are unused, just point them at src[0] + for (uint32_t i = num_tensors; i < MAX_PARAMETER_COUNT; ++i) { + buf[i] = buf[0]; + offset[i] = 0; + } + if (ctx->do_add_rms_partials) { + buf[num_tensors] = ctx->prealloc_add_rms_partials; + offset[num_tensors] = ctx->prealloc_size_add_rms_partials_offset; + } + + std::array elements; + + uint32_t ne = ggml_nelements(dst); + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + + static_assert(MAX_PARAMETER_COUNT == 12); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + ggml_vk_subbuffer(ctx, buf[0], offset[0]), + ggml_vk_subbuffer(ctx, buf[1], offset[1]), + ggml_vk_subbuffer(ctx, buf[2], offset[2]), + ggml_vk_subbuffer(ctx, buf[3], offset[3]), + ggml_vk_subbuffer(ctx, buf[4], offset[4]), + ggml_vk_subbuffer(ctx, buf[5], offset[5]), + ggml_vk_subbuffer(ctx, buf[6], offset[6]), + ggml_vk_subbuffer(ctx, buf[7], offset[7]), + ggml_vk_subbuffer(ctx, buf[8], offset[8]), + ggml_vk_subbuffer(ctx, buf[9], offset[9]), + ggml_vk_subbuffer(ctx, buf[10], offset[10]), + ggml_vk_subbuffer(ctx, buf[11], offset[11]), + }, pc, elements); +} + +void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, ctx->do_add_rms_partials, + }); +} + +void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SUB, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_MUL, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_DIV, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t src2_type_size = ggml_type_size(src2->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_ADD_ID, { + (uint32_t)dst->ne[0], + (uint32_t)dst->ne[1], + (uint32_t)src0->nb[1] / src0_type_size, + (uint32_t)src0->nb[2] / src0_type_size, + (uint32_t)src1->nb[1] / src1_type_size, + (uint32_t)src2->nb[1] / src2_type_size, + }); +} + +static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) { + GGML_ASSERT(version == 6 || version == 7); + int num_srcs = version == 6 ? 6 : 7; + + for (int i = 0; i < num_srcs; i++) { + GGML_ASSERT(!ggml_is_quantized(dst->src[i]->type)); + } + + GGML_ASSERT(dst->buffer != nullptr); + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); + GGML_ASSERT(pipeline != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer src_buf[7] = {}; + for (int i = 0; i < num_srcs; i++) { + src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); + } + + std::array elements = { + (uint32_t)(pc.B * pc.H), + 1, + 1 + }; + + if (version == 6) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, + pc, elements); + } else if (version == 7) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, + pc, elements); + } else { + // shouldn't happen + GGML_ASSERT(false); + } +} + +void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + const size_t seq_length = dst->src[0]->ne[2]; + const size_t n_embed = dst->ne[0]; + const size_t n_heads = dst->src[0]->ne[1]; + const size_t n_seqs = dst->src[5]->ne[1]; + + ggml_vk_op_f32_wkv( + ctx, subctx, dst, + { + (uint32_t)n_seqs, + (uint32_t)seq_length, + (uint32_t)n_embed, + (uint32_t)n_heads, + }, + 6 + ); +} + +void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + const size_t seq_length = dst->src[0]->ne[2]; + const size_t n_embed = dst->ne[0]; + const size_t n_heads = dst->src[0]->ne[1]; + const size_t n_seqs = dst->src[6]->ne[1]; + + ggml_vk_op_f32_wkv( + ctx, subctx, dst, + { + (uint32_t)n_seqs, + (uint32_t)seq_length, + (uint32_t)n_embed, + (uint32_t)n_heads, + }, + 7 + ); +} + +void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + const ggml_tensor * src_q = dst->src[0]; + const ggml_tensor * src_v = dst->src[2]; + const ggml_tensor * src_beta = dst->src[4]; + + GGML_ASSERT(dst->buffer != nullptr); + + const uint32_t S_v = (uint32_t)src_v->ne[0]; + const uint32_t H = (uint32_t)src_v->ne[1]; + const uint32_t n_tokens = (uint32_t)src_v->ne[2]; + const uint32_t n_seqs = (uint32_t)src_v->ne[3]; + + // K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs]. + const uint32_t K = (uint32_t)ggml_get_op_params_i32(dst, 0); + + const uint32_t s_off = S_v * H * n_tokens * n_seqs; + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); + GGML_ASSERT(pipeline != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer src_buf[6] = {}; + for (int i = 0; i < 6; i++) { + src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); + } + + const uint32_t sq1 = (uint32_t)(src_q->nb[1] / sizeof(float)); + const uint32_t sq2 = (uint32_t)(src_q->nb[2] / sizeof(float)); + const uint32_t sq3 = (uint32_t)(src_q->nb[3] / sizeof(float)); + const uint32_t sv1 = (uint32_t)(src_v->nb[1] / sizeof(float)); + const uint32_t sv2 = (uint32_t)(src_v->nb[2] / sizeof(float)); + const uint32_t sv3 = (uint32_t)(src_v->nb[3] / sizeof(float)); + const uint32_t sb1 = (uint32_t)(src_beta->nb[1] / sizeof(float)); + const uint32_t sb2 = (uint32_t)(src_beta->nb[2] / sizeof(float)); + const uint32_t sb3 = (uint32_t)(src_beta->nb[3] / sizeof(float)); + + const uint32_t neq1 = (uint32_t)src_q->ne[1]; + const uint32_t rq3 = (uint32_t)(src_v->ne[3] / src_q->ne[3]); + + const float scale = 1.0f / sqrtf((float)S_v); + const vk_op_gated_delta_net_push_constants pc = { + H, n_tokens, n_seqs, s_off, + sq1, sq2, sq3, + sv1, sv2, sv3, + sb1, sb2, sb3, + neq1, rq3, + scale, + K + }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, + pc, { H, n_seqs, S_v }); +} + +void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * src2 = dst->src[2]; + const ggml_tensor * src3 = dst->src[3]; + const ggml_tensor * src4 = dst->src[4]; + const ggml_tensor * src5 = dst->src[5]; + + GGML_ASSERT(dst->buffer != nullptr); + + const uint32_t head_dim = src0->ne[1]; + const uint32_t n_head = src1->ne[1]; + const uint32_t n_group = src4->ne[1]; + const uint32_t n_tok = src1->ne[2]; + const uint32_t n_seq = src1->ne[3]; + + bool is_mamba2 = (src3->nb[1] == sizeof(float)); + GGML_ASSERT(is_mamba2); + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op); + GGML_ASSERT(pipeline != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + const int64_t s_off = ggml_nelements(src1) * sizeof(float); + + const vk_op_ssm_scan_push_constants pc = { + (uint32_t)src0->nb[2], (uint32_t)src0->nb[3], + (uint32_t)src1->nb[2], (uint32_t)src1->nb[3], + (uint32_t)src2->nb[1], (uint32_t)src2->nb[2], + (uint32_t)src3->nb[1], + (uint32_t)src4->nb[2], (uint32_t)src4->nb[3], + (uint32_t)src5->nb[2], (uint32_t)src5->nb[3], + (uint32_t)s_off, + n_head, head_dim, n_group, n_tok + }; + + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer src_buf[7] = {}; + for (int i = 0; i < 7 && dst->src[i] != nullptr; i++) { + src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); + } + + std::array elements; + + const uint32_t d_state = src0->ne[0]; + uint32_t num_subgroups = d_state / ctx->device->subgroup_size; + const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups); + const uint32_t num_workgroups_y = n_seq; + elements = { num_workgroups_x, num_workgroups_y, 1 }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, + pc, elements); +} + +void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * conv = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = conv->src[0]; + const ggml_tensor * src1 = conv->src[1]; + + // Pick the destination tensor (last node in the fused chain) and the optional bias. + // Fusion modes: 0 = ssm_conv, 1 = ssm_conv+silu, 2 = ssm_conv+add(bias)+silu. + ggml_tensor * dst = conv; + const ggml_tensor * bias = nullptr; + + if (ctx->num_additional_fused_ops == 1) { + dst = cgraph->nodes[node_idx + 1]; // silu + } else if (ctx->num_additional_fused_ops == 2) { + ggml_tensor * add = cgraph->nodes[node_idx + 1]; + bias = (add->src[0] == conv) ? add->src[1] : add->src[0]; + dst = cgraph->nodes[node_idx + 2]; // silu + } + + // The shader always declares 4 bindings; bind src0 as a dummy when bias isn't fused. + const ggml_tensor * src2 = bias ? bias : src0; + + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SSM_CONV, { + (uint32_t)src0->nb[1], (uint32_t)src0->nb[2], + (uint32_t)src1->nb[1], + (uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2], + (uint32_t)src1->ne[0], + (uint32_t)src0->ne[0], + (uint32_t)src0->ne[1], + (uint32_t)dst->ne[1], + (uint32_t)dst->ne[2], + }); +} + +static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) { + const ggml_tensor * x = dst->src[0]; + const ggml_tensor * g = dst->src[1]; + const ggml_tensor * gm = dst->src[2]; + const ggml_tensor * gv = dst->src[3]; + const ggml_tensor * p = dst->src[4]; + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(g->type == GGML_TYPE_F32); + GGML_ASSERT(gm->type == GGML_TYPE_F32); + GGML_ASSERT(gv->type == GGML_TYPE_F32); + GGML_ASSERT(p->type == GGML_TYPE_F32); + GGML_ASSERT(dst->buffer != nullptr); + GGML_ASSERT(ggml_is_contiguous(x)); + GGML_ASSERT(ggml_is_contiguous(g)); + GGML_ASSERT(ggml_is_contiguous(gm)); + GGML_ASSERT(ggml_is_contiguous(gv)); + GGML_ASSERT(ggml_is_contiguous(p)); + GGML_ASSERT(ggml_are_same_shape(x, g)); + GGML_ASSERT(ggml_are_same_shape(x, gm)); + GGML_ASSERT(ggml_are_same_shape(x, gv)); + GGML_ASSERT(ggml_nelements(p) == 7); + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW); + GGML_ASSERT(pipeline != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); + vk_subbuffer g_buf = ggml_vk_tensor_subbuffer(ctx, g); + vk_subbuffer gm_buf = ggml_vk_tensor_subbuffer(ctx, gm); + vk_subbuffer gv_buf = ggml_vk_tensor_subbuffer(ctx, gv); + vk_subbuffer p_buf = ggml_vk_tensor_subbuffer(ctx, p); + + std::array elements = { (uint32_t)ggml_nelements(x), 1, 1 }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {x_buf, g_buf, gm_buf, gv_buf, p_buf}, + pc, elements); +} + +void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + const size_t n = ggml_nelements(dst->src[0]); + + ggml_vk_op_f32_opt_step_adamw( + ctx, subctx, dst, + { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f } + ); +} + +void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { + const size_t n = ggml_nelements(dst->src[0]); + + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f }); +} + +void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + int * op_params = (int *)dst->op_params; + + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONCAT, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, op_params[0], + }); +} + +void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); + + GGML_TENSOR_UNARY_OP_LOCALS + + float sf0 = (float)ne0 / ne00; + float sf1 = (float)ne1 / ne01; + float sf2 = (float)ne2 / ne02; + float sf3 = (float)ne3 / ne03; + float pixel_offset = 0.5f; + + if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0; + sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; + pixel_offset = 0.0f; + } + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UPSCALE, { + (uint32_t)ggml_nelements(dst), 0, 0, + (uint32_t)ne00, (uint32_t)ne01, + (uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size, + (uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, + sf0, sf1, sf2, sf3, pixel_offset + }); +} + +void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); + p.param2 = ggml_get_op_params_f32(dst, 1); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p)); +} + +void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst)); +} + +void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst)); +} + +void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD1, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_vk_arange(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")"); + + vk_op_push_constants pc = { + (uint32_t)ggml_nelements(dst), + 1, + ggml_get_op_params_f32(dst, 0), + ggml_get_op_params_f32(dst, 2), + 0.0f, 0.0f, + }; + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_ARANGE); + GGML_ASSERT(pipeline != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false); + + std::array elements = { (uint32_t)ggml_nelements(dst), 1, 1 }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements); +} + +void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_vk_fill(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")"); + + vk_op_push_constants pc = { + (uint32_t)ggml_nelements(dst), + 1, + ggml_get_op_params_f32(dst, 0), + 0.0f, + 0.0f, 0.0f, + }; + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_FILL); + GGML_ASSERT(pipeline != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false); + + std::array elements = { (uint32_t)ggml_nelements(dst), 1, 1 }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements); +} + +void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst)); +} + +void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst)); +} + +void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LOG, vk_op_unary_push_constants_init(src0, dst)); +} + +void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); +} + +void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p)); +} + +void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); + p.param2 = ggml_get_op_params_f32(dst, 1); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p)); +} + +void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p)); +} + +void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const int32_t s0 = ggml_get_op_params_i32(dst, 0); + const int32_t s1 = ggml_get_op_params_i32(dst, 1); + const int32_t s2 = ggml_get_op_params_i32(dst, 2); + const int32_t s3 = ggml_get_op_params_i32(dst, 3); + const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000); + const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000); + + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + memcpy(&p.param1, &s01_packed, sizeof(float)); + memcpy(&p.param2, &s23_packed, sizeof(float)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p)); +} + +void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p)); +} + +void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p)); +} + +void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + uint32_t ne = (uint32_t)ggml_nelements(src0); + if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { + // Convert from number of logical elements to 2- or 4-byte units. + ne /= ggml_blck_size(src0->type); + if ((ggml_type_size(src0->type) % 4) == 0) { + ne *= ggml_type_size(src0->type) / 4; + } else { + ne *= ggml_type_size(src0->type) / 2; + } + } + + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p)); +} + +void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + // Skip empty skip_rows operations. For most ops the empty check at the start + // of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst + // with empty srcs. + if (ggml_is_empty(src0) || ggml_is_empty(src1)) { + return; + } + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SET_ROWS, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); +} + +void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + float * op_params = (float *)dst->op_params; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); +} + +void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const int * int_op_params = (const int *)dst->op_params; + const float * float_op_params = (const float *)dst->op_params; + + const uint32_t num_groups = int_op_params[0]; + const float eps = float_op_params[1]; + const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f, 0.0f, 0.0f }); +} + +static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) { + const uint32_t ne = (uint32_t)node->ne[0]; + const uint32_t denom = ctx->device->pipeline_add_rms[0][0][0]->wg_denoms[0]; + const uint32_t num_partials = CEIL_DIV(ne, denom); + return num_partials; +} + +uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node) { + const uint32_t num_partials = ggml_vk_rms_num_partials(ctx, node); + const uint32_t num_bytes = ROUNDUP_POW2(num_partials * sizeof(uint32_t), ctx->device->partials_binding_alignment); + return num_bytes; +} + +static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *dst, const ggml_tensor *src0, const bool has_ff, bool backprop, const uint32_t set_rows_stride) { + const int n_dims = ((const int32_t *) dst->op_params)[1]; + const int mode = ((const int32_t *) dst->op_params)[2]; + // const int n_ctx = ((const int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((const int32_t *) dst->op_params)[4]; + const float freq_base = ((const float *) dst->op_params)[5]; + const float freq_scale = ((const float *) dst->op_params)[6]; + const float ext_factor = ((const float *) dst->op_params)[7]; + const float attn_factor = ((const float *) dst->op_params)[8]; + const float beta_fast = ((const float *) dst->op_params)[9]; + const float beta_slow = ((const float *) dst->op_params)[10]; + int sections[4] {}; + if (mode & GGML_ROPE_TYPE_MROPE) { + memcpy(sections, (const int32_t *) dst->op_params + 11, sizeof(int)*4); + } + + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type); + uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type); + uint32_t nb03 = src0->nb[3] / ggml_type_size(src0->type); + + uint32_t nb11 = dst->nb[1] / ggml_type_size(dst->type); + uint32_t nb12 = dst->nb[2] / ggml_type_size(dst->type); + uint32_t nb13 = dst->nb[3] / ggml_type_size(dst->type); + + vk_op_rope_push_constants rope { + (uint32_t)mode, (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale, + freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, has_ff, + { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, + + (uint32_t)src0->ne[0], + (uint32_t)src0->ne[1], + (uint32_t)src0->ne[2], + nb01, nb02, nb03, + nb11, nb12, nb13, + 0, 0, // a_offset, d_offset filled in by init_pushconst_tensor_offsets + }; + + return rope; +} + +void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params) { + ggml_tensor * dst; + const ggml_tensor * src0; + const ggml_tensor * src1; + + if (ctx->num_additional_fused_ops > 0) { + // fused rms_norm + mul + ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + ggml_tensor *other_src = mul->src[0] == cgraph->nodes[node_idx + 0] ? mul->src[1] : mul->src[0]; + dst = mul; + src0 = cgraph->nodes[node_idx]->src[0]; + src1 = other_src; + } else { + dst = cgraph->nodes[node_idx]; + src0 = src1 = dst->src[0]; + } + + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + uint32_t param3 = ctx->do_add_rms_partials ? ggml_vk_rms_num_partials(ctx, dst) : 0; + + vk_op_binary_push_constants bin { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + op_params[0], 0.0f, (int32_t)param3, + }; + + // more than one fused op means rms_norm+mul+rope + if (ctx->num_additional_fused_ops > 1) { + static constexpr uint32_t max_tensors = 7; + const ggml_tensor *tensors[max_tensors] {}; + + ggml_tensor *rms = cgraph->nodes[node_idx + 0]; + ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + ggml_tensor *rope = cgraph->nodes[node_idx + 2]; + + ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0]; + + bool do_set_rows = ctx->num_additional_fused_ops == 4; + + tensors[0] = rms->src[0]; + tensors[1] = other_src; + tensors[2] = mul; + tensors[3] = rope->src[1]; // pos + tensors[4] = rope->src[2]; // ff + tensors[5] = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; // dst + tensors[6] = do_set_rows ? tensors[5]->src[1] : nullptr; + const uint32_t set_rows_stride = do_set_rows ? tensors[5]->nb[1] / ggml_type_size(tensors[5]->type) : 0; + + vk_op_rms_norm_mul_rope_push_constants pc; + pc.bin = bin; + pc.rope = ggml_vk_make_rope_constants(rope, rope->src[0], tensors[4] != nullptr, false, set_rows_stride); + + vk_pipeline pipeline = tensors[5]->type == GGML_TYPE_F16 ? ctx->device->pipeline_rms_norm_mul_rope_f32_f16 : ctx->device->pipeline_rms_norm_mul_rope_f32_f32; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + ggml_backend_vk_buffer_context * buf_ctx[max_tensors]; + vk_buffer buf[max_tensors]; + size_t offset[max_tensors]; + bool uma[max_tensors]; + + for (uint32_t i = 0; i < max_tensors; ++i) { + if (!tensors[i]) { + // If any remaining descriptors are unused, just point them at src[0] + buf[i] = buf[0]; + offset[i] = 0; + continue; + } + buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; + buf[i] = nullptr; + offset[i] = 0; + uma[i] = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); + uma[i] = buf[i] != nullptr; + } + if (!uma[i]) { + buf[i] = buf_ctx[i]->dev_buffer; + offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; + } + GGML_ASSERT(buf[i] != nullptr); + } + + // a_offset is unused (the fused path reads from shared memory), but the rope/set_rows dst can be misaligned. + // Round the binding offset down to the storage buffer alignment; the in-element shift goes in pc.rope.d_offset. + pc.rope.d_offset = get_misalign_bytes(ctx, tensors[5]) / ggml_type_size(tensors[5]->type); + offset[5] &= ~(size_t(ctx->device->properties.limits.minStorageBufferOffsetAlignment) - 1); + + std::array elements; + elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] }; + + static_assert(max_tensors == 7); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + ggml_vk_subbuffer(ctx, buf[0], offset[0]), + ggml_vk_subbuffer(ctx, buf[1], offset[1]), + ggml_vk_subbuffer(ctx, buf[2], offset[2]), + ggml_vk_subbuffer(ctx, buf[3], offset[3]), + ggml_vk_subbuffer(ctx, buf[4], offset[4]), + ggml_vk_subbuffer(ctx, buf[5], offset[5]), + ggml_vk_subbuffer(ctx, buf[6], offset[6]), + }, pc, elements); + } else { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, std::move(bin)); + } + + if (ctx->do_add_rms_partials_offset_calculation) { + ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0); + ctx->do_add_rms_partials = false; + ctx->do_add_rms_partials_offset_calculation = false; + } +} + +void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + float * op_params = (float *)dst->op_params; + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); +} + +void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const float * op_params = (const float *)dst->op_params; + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[0]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, std::move(p)); +} + +void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst)); +} + +void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + float * op_params = (float *)dst->op_params; + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[1]; + p.param2 = op_params[2]; + p.param3 = op_params[3]; + p.param4 = op_params[4]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p)); +} + +void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const float * op_params_f = (const float *)dst->op_params; + + const bool swapped = (bool)dst->op_params[1]; + const bool split = src1 != nullptr; + const float alpha = op_params_f[2]; + const float limit = op_params_f[3]; + + if (!split) { + GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]); + } else { + GGML_ASSERT(src0->ne[0] == src1->ne[0]); + GGML_ASSERT(src0->ne[0] == dst->ne[0]); + GGML_ASSERT(src0->type == src1->type); + } + + const uint32_t mode = split ? 2 : (swapped ? 1 : 0); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size; + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU, + { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], + (uint32_t)dst->ne[0], + mode, + alpha, + limit, + (uint32_t)(src0->nb[0] / src0_type_size), + (uint32_t)(src0->nb[1] / src0_type_size), + (uint32_t)(src0->nb[2] / src0_type_size), + (uint32_t)(src0->nb[3] / src0_type_size), + (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size), + (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size), + (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size), + (uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size), + (uint32_t)(dst->nb[0] / dst_type_size), + (uint32_t)(dst->nb[1] / dst_type_size), + (uint32_t)(dst->nb[2] / dst_type_size), + (uint32_t)(dst->nb[3] / dst_type_size), + (uint32_t)dst->ne[1], + (uint32_t)dst->ne[2], + 0, + 0, 0, 0, 0, 0, 0, + }); +} + +void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + int32_t * op_params = (int32_t *)dst->op_params; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); +} + +void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { + float * op_params = (float *)dst->op_params; + + float scale = op_params[0]; + float max_bias = op_params[1]; + + const uint32_t ncols = (uint32_t)src0->ne[0]; + const uint32_t nrows_x = (uint32_t)ggml_nrows(src0); + const uint32_t nrows_y = (uint32_t)src0->ne[1]; + + const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u; + const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u; + const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u; + const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u; + const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u; + + const uint32_t n_head_kv = src0->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + vk_op_soft_max_push_constants pc { + ncols, + src1 != nullptr ? nrows_y : (uint32_t)0, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], + ne12, ne13, + nb11, nb12, nb13, + scale, max_bias, + m0, m1, + n_head_log2, + nrows_x, + src2 != nullptr + }; + + if (ncols <= 16384) { + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc)); + } else { + + vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a; + vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a; + vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst); + + uint32_t elems_per_wg = 128 * 4; + uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg); + size_t tmp_size = num_wgs * nrows_x * sizeof(float); + + if (ctx->prealloc_size_x < tmp_size) { + ctx->prealloc_size_x = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_size_y < tmp_size) { + ctx->prealloc_size_y = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size }; + vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size }; + + std::array elements = { num_wgs, nrows_x, 1 }; + + vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32; + vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32; + vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + + ctx->prealloc_x_need_sync = true; + ctx->prealloc_y_need_sync = true; + } +} + +void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + float * op_params = (float *)dst->op_params; + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1], 0.0f, 0.0f }); +} + +void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { + topk_moe_mode mode = ctx->fused_topk_moe_mode; + ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0]; + ggml_tensor * bias = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits; + ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; + ggml_tensor * ids = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] : + (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : + cgraph->nodes[node_idx + 3]; + + GGML_ASSERT(logits->type == GGML_TYPE_F32); + GGML_ASSERT(bias->type == GGML_TYPE_F32); + GGML_ASSERT(weights->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + const int n_experts = logits->ne[0]; + const int n_rows = logits->ne[1]; + const int n_expert_used = weights->ne[1]; + + GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts); + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits); + vk_subbuffer bias_buf = ggml_vk_tensor_subbuffer(ctx, bias); + vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights); + vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids); + + vk_op_topk_moe_push_constants pc {}; + pc.n_rows = n_rows; + pc.n_experts_push = n_experts; + pc.n_expert_used = n_expert_used; + pc.clamp_min = -std::numeric_limits::infinity(); + pc.clamp_max = std::numeric_limits::infinity(); + if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { + ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; + GGML_ASSERT(clamp->op == GGML_OP_CLAMP); + pc.clamp_min = ggml_get_op_params_f32(clamp, 0); + pc.clamp_max = ggml_get_op_params_f32(clamp, 1); + } + if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) { + ggml_tensor * clamp = cgraph->nodes[node_idx + 8]; + GGML_ASSERT(clamp->op == GGML_OP_CLAMP); + pc.clamp_min = ggml_get_op_params_f32(clamp, 0); + pc.clamp_max = ggml_get_op_params_f32(clamp, 1); + } + +#define GATING_FUNC_SOFTMAX 0 +#define GATING_FUNC_SIGMOID 1 +#define GATING_FUNC_SOFTMAX_WEIGHT 2 + + pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID : + mode == TOPK_MOE_LATE_SOFTMAX ? GATING_FUNC_SOFTMAX_WEIGHT : + GATING_FUNC_SOFTMAX; + pc.has_bias = mode == TOPK_MOE_SIGMOID_NORM_BIAS; + pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS; + if (ctx->fused_topk_moe_scale) { + GGML_ASSERT(weights->op == GGML_OP_SCALE); + pc.output_scale = ggml_get_op_params_f32(weights, 0); + pc.output_bias = ggml_get_op_params_f32(weights, 1); + } else { + pc.output_scale = 1.0f; + pc.output_bias = 0.0f; + } + + GGML_ASSERT(n_expert_used <= n_experts); + + const uint32_t rows_per_block = 4; + std::array elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements); +} + +void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * src2 = dst->src[2]; + const ggml_tensor * src3 = nullptr; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + // const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + const float freq_base = ((float *) dst->op_params)[5]; + const float beta_fast = ((float *) dst->op_params)[9]; + const float beta_slow = ((float *) dst->op_params)[10]; + int sections[4] {}; + if (mode & GGML_ROPE_TYPE_MROPE) { + memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4); + } + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + uint32_t set_rows_stride = 0; + // Fused rope + view + set_rows passes the set_rows destination stride in set_rows_stride + // and overrides the dst and sets src3=row_indices + if (ctx->num_additional_fused_ops > 0) { + set_rows_stride = cgraph->nodes[node_idx + 2]->nb[1] / ggml_type_size(cgraph->nodes[node_idx + 2]->type); + src3 = cgraph->nodes[node_idx + 2]->src[1]; + dst = cgraph->nodes[node_idx + 2]; + } + + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, src3, dst, GGML_OP_ROPE, + ggml_vk_make_rope_constants(cgraph->nodes[node_idx], src0, src2 != nullptr, backprop, set_rows_stride)); +} + +void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t * op_params = (const uint32_t *)dst->op_params; + + uint32_t ncols = src0->ne[0]; + uint32_t nrows = ggml_nrows(src0); + + uint32_t ncols_pad_log2 = (uint32_t)ceilf(log2f(float(ncols))); + uint32_t ncolsp2 = 1 << ncols_pad_log2; + + vk_op_argsort_push_constants pc { ncols, ncolsp2, ncols_pad_log2, nrows, op_params[0], 0, 0, 0, 0, }; + + // Pick the largest workgroup size <= ncolsp2 + uint32_t pipeline_idx = std::min(ncols_pad_log2, num_argsort_pipelines - 1); + + // Use the "small" argsort shader if the whole sort can be done by a single workgroup. + bool use_small = ncols_pad_log2 <= ctx->device->max_workgroup_size_log2 && + ctx->device->pipeline_argsort_f32[pipeline_idx] != nullptr; + + vk_pipeline pipeline = use_small ? ctx->device->pipeline_argsort_f32[pipeline_idx] + : ctx->device->pipeline_argsort_large_f32[pipeline_idx]; + + vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer subbuf1 = dst_buf; + + // Reserve space for ivec2 per element, with rows padded to a power of two + if (!use_small) { + const size_t x_sz = size_t{ncolsp2} * nrows * 2 * sizeof(int); + + if (ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + subbuf1 = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; + } + + std::array elements; + + elements[0] = ncolsp2; + elements[1] = std::min((uint32_t)ggml_nrows(src0), ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + elements[2] = 1; + + // First dispatch initializes tmp_idx and does the first N passes where + // there is only communication between threads in the same workgroup. + { + vk_op_argsort_push_constants pc2 = pc; + pc2.outer_start = 0; + pc2.outer_end = std::min(ncols_pad_log2, ctx->device->max_workgroup_size_log2); + pc2.inner_start = 0; + pc2.inner_end = 100; + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements); + } + if (!use_small) { + ggml_vk_sync_buffers(ctx, subctx); + // Loop over outer/inner passes, synchronizing between each pass. + for (uint32_t outer = ctx->device->max_workgroup_size_log2; outer < ncols_pad_log2; ++outer) { + for (uint32_t inner = 0; inner < outer + 1; ++inner) { + vk_op_argsort_push_constants pc2 = pc; + pc2.outer_start = outer; + pc2.outer_end = outer + 1; + pc2.inner_start = inner; + pc2.inner_end = inner + 1; + // When the inner idx is large enough, there's only communication + // within a workgroup. So the remaining inner iterations can all + // run in the same dispatch. + if (outer - inner < pipeline_idx) { + pc2.inner_end = 100; + inner = outer; + pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx]; + } else { + // Smaller workgroup empirically seems to perform better + pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx - 2]; + } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements); + ggml_vk_sync_buffers(ctx, subctx); + } + } + ctx->prealloc_x_need_sync = true; + } +} + +void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + uint32_t ncols = src0->ne[0]; + uint32_t nrows = ggml_nrows(src0); + uint32_t k = dst->ne[0]; + + vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 }; + + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + std::array elements; + elements[1] = std::min(nrows, ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + elements[2] = 1; + + uint32_t num_elements = ncols; + + // Each iteration reduces a workgroup's worth of elements down to the K + // largest elements. Repeat until we have the top K elements. + // Need to do at least one iteration to write out the results. + bool done_one_iter = false; + uint32_t dbl_buf_index = 0; + size_t dbl_buf_size; + while (num_elements > k || !done_one_iter) { + + // Prefer going as small as num_topk_pipelines - 3 for perf reasons. + // But if K is larger, then we need a larger workgroup + uint32_t max_pipeline = num_topk_pipelines - 1; + uint32_t preferred_pipeline = std::max(num_topk_pipelines - 3, (uint32_t)log2f(float(k)) + 2); + max_pipeline = std::min(preferred_pipeline, max_pipeline); + uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1; + // require full subgroup + min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2); + + uint32_t pipeline_idx = (uint32_t)ceilf(log2f(float(num_elements))); + pipeline_idx = std::min(pipeline_idx, max_pipeline); + pipeline_idx = std::max(pipeline_idx, min_pipeline); + + if (num_elements > (1u << pipeline_idx)) { + // If we could finish on this loop iteration (i.e. a single workgroup) + // then do so. It's better than the overhead of another pass. + for (uint32_t i = pipeline_idx; i < num_topk_pipelines; ++i) { + if (num_elements <= (1u << i)) { + pipeline_idx = i; + break; + } + } + } + + vk_pipeline pipeline = ctx->device->pipeline_topk_f32[pipeline_idx]; + // If the device doesn't support a pipeline this large, use smaller + while (!pipeline) { + pipeline_idx--; + GGML_ASSERT(pipeline_idx >= min_pipeline); + pipeline = ctx->device->pipeline_topk_f32[pipeline_idx]; + } + + vk_op_topk_push_constants pc2 = pc; + pc2.ncols_input = num_elements; + + // Number of elements remaining after this pass + uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]); + + pc2.ncols_output = num_dst_elements; + + if (!done_one_iter) { + // Reserve space for ivec2 per element, double buffered + // K per workgroup per row + dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int); + dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment); + const size_t x_sz = dbl_buf_size * 2; + + if (ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; + ggml_vk_preallocate_buffers(ctx, subctx); + } + } + + vk_subbuffer src_buf; + vk_subbuffer dst_buf; + + if (num_elements == ncols) { + pc2.first_pass = 1; + src_buf = ggml_vk_tensor_subbuffer(ctx, src0); + } else { + src_buf = { ctx->prealloc_x, dbl_buf_index * dbl_buf_size, dbl_buf_size }; + } + if (num_dst_elements == k) { + pc2.last_pass = 1; + dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + } else { + dst_buf = { ctx->prealloc_x, (dbl_buf_index ^ 1) * dbl_buf_size, dbl_buf_size }; + } + + elements[0] = num_elements; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc2, elements); + num_elements = num_dst_elements; + dbl_buf_index ^= 1; + if (num_elements > k) { + ggml_vk_sync_buffers(ctx, subctx); + } + done_one_iter = true; + } + ctx->prealloc_x_need_sync = true; +} + +void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p); +} + +void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p); +} + +void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); + p.weight = 1.0f / (float)src0->ne[0]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p); +} + +void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); + // Use the single pass shader when the rows are small or there are enough rows to fill the GPU. + // For fewer, larger rows, use the multipass shader to spread each row across SMs. + if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc); + return; + } + + // First pass computes partial sums within a block, and stores the last partial + // to the temp buffer. Second pass sums the block partials from the temp buffer + // and adds that to the result of the first pass. + vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32; + vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32; + GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr); + + ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); + + std::array elements; + + elements[0] = dst->ne[0]; + elements[1] = (uint32_t)ggml_nrows(dst); + elements[2] = 1; + + size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst); + + if (ctx->prealloc_size_split_k < temp_size) { + ctx->prealloc_size_split_k = temp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + + vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); + + if (ctx->prealloc_split_k_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements); + + ctx->prealloc_split_k_need_sync = true; +} + +void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f, 0.0f, 0.0f }); +} + +void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); +} + +void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOLVE_TRI, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + +void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int32_t s0 = dst->op_params[0]; + const int32_t s1 = dst->op_params[1]; + const int32_t p0 = dst->op_params[2]; + const int32_t p1 = dst->op_params[3]; + const int32_t d0 = dst->op_params[4]; + const int32_t d1 = dst->op_params[5]; + + const bool is_2D = dst->op_params[6] == 1; + + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + const uint32_t IH = is_2D ? src1->ne[1] : 1; + const uint32_t IW = src1->ne[0]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32 + + const uint32_t batch = src1->ne[is_2D ? 3 : 2]; + + const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + const vk_buffer d_buf = d_buf_ctx->dev_buffer; + + const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL, { + dst_addr, + batch_offset, offset_delta, + IC, IW, IH, OW, OH, KW, KH, + OH * batch, + IC * KH * KW, + s0, s1, p0, p1, d0, d1, batch * IC + }); +} + +void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_TENSOR_BINARY_OP_LOCALS + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t s2 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[3]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[4]; + const int32_t p2 = ((const int32_t *)(dst->op_params))[5]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[6]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[7]; + const int32_t d2 = ((const int32_t *)(dst->op_params))[8]; + const int32_t IC = ((const int32_t *)(dst->op_params))[9]; + + const int64_t N = ne13 / IC; + const int64_t ID = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + const int64_t KD = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OD = ne3 / N; + const int64_t OH = ne2; + const int64_t OW = ne1; + + const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + const vk_buffer d_buf = d_buf_ctx->dev_buffer; + + const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; + + vk_op_im2col_3d_push_constants pc {}; + + pc.dst_addr = dst_addr; + pc.nb10 = nb10 / ggml_type_size(src1->type); + pc.nb11 = nb11 / ggml_type_size(src1->type); + pc.nb12 = nb12 / ggml_type_size(src1->type); + pc.nb13 = nb13 / ggml_type_size(src1->type); + pc.s0 = s0; + pc.s1 = s1; + pc.s2 = s2; + pc.p0 = p0; + pc.p1 = p1; + pc.p2 = p2; + pc.d0 = d0; + pc.d1 = d1; + pc.d2 = d2; + pc.IW = IW; + pc.IH = IH; + pc.ID = ID; + pc.IC = IC; + pc.KW = KW; + pc.OH = OH; + pc.KD_KH_KW = KD*KH*KW; + pc.KH_KW = KH*KW; + pc.IC_KD_KH_KW = IC*KD*KH*KW; + pc.N_OD_OH = N*OD*OH; + pc.OD_OH = OD*OH; + pc.OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW; + pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW; + pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW; + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc)); +} + +void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t dim = dst->op_params[0]; + const uint32_t max_period = dst->op_params[1]; + const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, { + nb1, dim, max_period, + }); +} + +void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // src0: (K, Cout, Cin, 1) -- kernel + // src1: (L, Cin, 1, 1) -- input + // dst: (*, Cout, 1, 1) + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + const int32_t s0 = dst->op_params[0]; + + vk_op_conv_transpose_1d_push_constants p{}; + p.Cout = static_cast(ne01); + p.Cin = static_cast(ne02); + p.K = static_cast(ne00); + p.L = static_cast(ne10); + p.KL = static_cast(ne0); + p.nb01 = static_cast(nb01 / nb00); + p.nb02 = static_cast(nb02 / nb00); + p.nb11 = static_cast(nb11 / nb10); + p.nb1 = static_cast(nb1 / nb0); + p.s0 = static_cast(s0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p)); +} + +void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + // src0: [K_OC, T_in] columns from matmul + // dst: [T_out, OC] + + const int32_t stride = dst->op_params[0]; + const int32_t oc = dst->op_params[1]; + const int32_t p0 = dst->op_params[2]; + + const uint32_t K_OC = static_cast(src0->ne[0]); + const uint32_t T_in = static_cast(src0->ne[1]); + const uint32_t T_out = static_cast(dst->ne[0]); + const uint32_t OC = static_cast(oc); + const uint32_t K = K_OC / OC; + + vk_op_col2im_1d_push_constants p{}; + p.T_out = T_out; + p.OC = OC; + p.K_OC = K_OC; + p.T_in = T_in; + p.K = K; + p.stride = stride; + p.p0 = p0; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p)); +} + +void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { + const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; + const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; + const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; + ggml_tensor * add = cgraph->nodes[node_idx + 4]; + + // x carries the full activation shape, a is the broadcast operand + const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; + const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; + + // mul1 reads sqr and inv_b in either operand order + const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; + + vk_pipeline pipeline = nullptr; + switch (x->type) { + case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break; + case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break; + case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break; + default: GGML_ABORT("unsupported type"); + } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); + vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a); + vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add); + + vk_op_snake_push_constants pc{}; + pc.ne0 = static_cast(x->ne[0]); + pc.ne1 = static_cast(x->ne[1]); + + std::array elements = { pc.ne0, pc.ne1, 1 }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements); +} + +void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + uint32_t op = static_cast(dst->op_params[0]); + const int32_t k1 = dst->op_params[1]; + const int32_t k0 = dst->op_params[2]; + const int32_t s1 = dst->op_params[3]; + const int32_t s0 = dst->op_params[4]; + const int32_t p1 = dst->op_params[5]; + const int32_t p0 = dst->op_params[6]; + + const uint32_t IH = src0->ne[1]; + const uint32_t IW = src0->ne[0]; + + const uint32_t N = dst->ne[3]; + + const uint32_t OC = dst->ne[2]; + const uint32_t OH = dst->ne[1]; + const uint32_t OW = dst->ne[0]; + + const uint32_t parallel_elements = N * OC * OH * OW; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_POOL_2D, { + IW, IH, OW, OH, OC, + parallel_elements, + op, + k0, k1, s0, s1, p0, p1, + }); +} + +void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D; + + vk_op_conv2d_push_constants p{}; + p.Cout = static_cast(!transpose ? ne03 : ne02); + p.Cin = static_cast(!transpose ? ne02 : ne03); + p.N = static_cast(ne13); + GGML_ASSERT(p.Cout == ne2); + GGML_ASSERT(p.Cin == ne12); + + p.W = static_cast(ne10); + p.H = static_cast(ne11); + p.OW = static_cast(ne0); + p.OH = static_cast(ne1); + + p.nb01 = static_cast(nb01 / nb00); + p.nb02 = static_cast(nb02 / nb00); + p.nb03 = static_cast(nb03 / nb00); + + p.nb11 = static_cast(nb11 / nb10); + p.nb12 = static_cast(nb12 / nb10); + p.nb13 = static_cast(nb13 / nb10); + + p.nb1 = static_cast(nb1 / nb0); + p.nb2 = static_cast(nb2 / nb0); + p.nb3 = static_cast(nb3 / nb0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p)); +} + +void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + vk_op_conv2d_dw_push_constants p{}; + p.ne = ggml_nelements(dst); + p.channels = dst->ne[2]; + p.batches = dst->ne[3]; + p.dst_w = dst->ne[0]; + p.dst_h = dst->ne[1]; + p.src_w = src1->ne[0]; + p.src_h = src1->ne[1]; + p.knl_w = src0->ne[0]; + p.knl_h = src0->ne[1]; + p.stride_x = dst->op_params[0]; + p.stride_y = dst->op_params[1]; + p.pad_x = dst->op_params[2]; + p.pad_y = dst->op_params[3]; + p.dilation_x = dst->op_params[4]; + p.dilation_y = dst->op_params[5]; + + GGML_ASSERT(src0->ne[3] == p.channels); + GGML_ASSERT(src1->ne[3] == p.batches); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p)); +} + +void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const float * op_params = (const float *)dst->op_params; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f, 0.0f, 0.0f }); +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-pipelines.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-pipelines.cpp new file mode 100644 index 0000000000..3ab4d29489 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-pipelines.cpp @@ -0,0 +1,512 @@ +#include "ggml-vulkan-common.h" + +std::mutex queue_mutex; + +void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +uint64_t vk_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; +} + +uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t) +{ + return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));; +} + +static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { + const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, + VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange}); + return range; +} + +void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) { + // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep + // during this wait. + if (ctx->almost_ready_fence_pending) { + VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence"); + ctx->device->device.resetFences({ ctx->almost_ready_fence }); + ctx->almost_ready_fence_pending = false; + } + + // Spin (w/pause) waiting for the graph to finish executing. + vk::Result result; + while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) { + if (result != vk::Result::eNotReady) { + fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__); + exit(1); + } + for (uint32_t i = 0; i < 100; ++i) { + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + } + } + ctx->device->device.resetFences({ ctx->fence }); +} + +void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) { + VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); + ctx->pipeline_descriptor_set_requirements += n; + if (!pipeline->compiled) { + ggml_vk_load_shaders(ctx->device, pipeline); + } + ggml_pipeline_allocate_descriptor_sets(ctx); +} + +void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) { + + if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) { + // Enough descriptors are available + return; + } + + vk_device& device = ctx->device; + + // Grow by 50% to avoid frequent allocations + uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements}); + uint32_t to_alloc = needed - ctx->descriptor_sets.size(); + uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; + uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; + + while (to_alloc > 0) { + const uint32_t alloc_count = std::min(pool_remaining, to_alloc); + to_alloc -= alloc_count; + pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; + + if (pool_idx >= ctx->descriptor_pools.size()) { + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE); + vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); + ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); + } + + std::vector layouts(alloc_count); + for (uint32_t i = 0; i < alloc_count; i++) { + layouts[i] = device->dsl; + } + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data()); + std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); + ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end()); + + pool_idx++; + } +} + +static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) { + VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()"); + vk::CommandBufferAllocateInfo command_buffer_alloc_info( + p.pool, + vk::CommandBufferLevel::ePrimary, + 1); + const std::vector cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info); + p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true }); + return &p.cmd_buffers[p.cmd_buffers.size()-1]; +} + +void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { + if (ctx->seqs.empty()) { + if (fence) { + std::lock_guard guard(queue_mutex); + ctx->p->q->queue.submit({}, fence); + } + return; + } + VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); + + std::vector> tl_wait_vals; + std::vector> tl_signal_vals; + std::vector> tl_wait_semaphores; + std::vector> tl_signal_semaphores; + std::vector tl_submit_infos; + std::vector submit_infos; + int idx = -1; + std::vector> stage_flags; + + size_t reserve = 0; + + for (const auto& sequence : ctx->seqs) { + reserve += sequence.size(); + } + + // Pre-reserve vectors to prevent reallocation, which invalidates pointers + tl_wait_semaphores.reserve(reserve); + tl_wait_vals.reserve(reserve); + tl_signal_semaphores.reserve(reserve); + tl_signal_vals.reserve(reserve); + tl_submit_infos.reserve(reserve); + submit_infos.reserve(reserve); + stage_flags.reserve(reserve); + + for (const auto& sequence : ctx->seqs) { + for (const auto& submission : sequence) { + stage_flags.push_back({}); + idx++; + tl_wait_vals.push_back({}); + tl_wait_semaphores.push_back({}); + tl_signal_vals.push_back({}); + tl_signal_semaphores.push_back({}); + for (size_t i = 0; i < submission.wait_semaphores.size(); i++) { + stage_flags[idx].push_back(ctx->p->q->stage_flags); + tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value); + tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s); + } + for (size_t i = 0; i < submission.signal_semaphores.size(); i++) { + tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value); + tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s); + } + tl_submit_infos.push_back({ + (uint32_t) submission.wait_semaphores.size(), + tl_wait_vals[idx].data(), + (uint32_t) submission.signal_semaphores.size(), + tl_signal_vals[idx].data(), + }); + tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo; + tl_submit_infos[idx].pNext = nullptr; + vk::SubmitInfo si{ + (uint32_t) submission.wait_semaphores.size(), + tl_wait_semaphores[idx].data(), + stage_flags[idx].data(), + 1, + &submission.buffer->buf, + (uint32_t) submission.signal_semaphores.size(), + tl_signal_semaphores[idx].data(), + }; + si.setPNext(&tl_submit_infos[idx]); + submit_infos.push_back(si); + } + } + + std::lock_guard guard(queue_mutex); + ctx->p->q->queue.submit(submit_infos, fence); + + ctx->seqs.clear(); +} + +uint32_t ggml_vk_find_queue_family_index(std::vector& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) { + VK_LOG_DEBUG("ggml_vk_find_queue_family_index()"); + const uint32_t qfsize = queue_family_props.size(); + + // Try with avoid preferences first + for (uint32_t i = 0; i < qfsize; i++) { + if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) { + return i; + } + } + + // Fall back to only required + for (size_t i = 0; i < qfsize; i++) { + if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) { + return i; + } + } + + // Fall back to reusing compute queue + for (size_t i = 0; i < qfsize; i++) { + if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) { + return i; + } + } + + // Fall back to ignoring min_num_queries + for (size_t i = 0; i < qfsize; i++) { + if (queue_family_props[i].queueFlags & required) { + return i; + } + } + + // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations. + // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional. + if (compute_index >= 0) { + return compute_index; + } + + std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl; + + for(auto &q_family : queue_family_props) { + std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl; + } + abort(); +} + +void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) { + VK_LOG_DEBUG("ggml_vk_create_queue()"); + std::lock_guard guard(device->mutex); + + q.queue_family_index = queue_family_index; + q.transfer_only = transfer_only; + + q.cmd_pool.init(device, &q); + + q.queue = device->device.getQueue(queue_family_index, queue_index); + + q.stage_flags = stage_flags; +} + +vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) { + vk_context result = std::make_shared(); + VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); + ctx->gc.contexts.emplace_back(result); + result->p = &p; + return result; +} + +vk_context ggml_vk_create_temporary_context(vk_command_pool& p) { + vk_context result = std::make_shared(); + VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); + result->p = &p; + return result; +} + +static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) { + VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()"); + vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 }; + vk::SemaphoreCreateInfo ci{}; + ci.setPNext(&tci); + vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); + ctx->gc.semaphores.push_back({ semaphore, 0 }); + return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1]; +} + +static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) { + VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()"); + if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) { + vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; + vk::SemaphoreCreateInfo ci{}; + ci.setPNext(&tci); + vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); + ctx->gc.tl_semaphores.push_back({ semaphore, 0 }); + } + return &ctx->gc.tl_semaphores[ctx->semaphore_idx++]; +} + +static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { + if (ctx->event_idx >= ctx->gc.events.size()) { + ctx->gc.events.push_back(ctx->device->device.createEvent({})); + } + return ctx->gc.events[ctx->event_idx++]; +} + +void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) { + VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()"); + + // Requires command buffers to be done + device->device.resetCommandPool(p.pool); + // Don't clear the command buffers and mark them as not in use. + // This allows us to reuse them + for (auto& cmd_buffer : p.cmd_buffers) { + cmd_buffer.in_use = false; + } +} + +void ggml_vk_queue_command_pools_cleanup(vk_device& device) { + VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()"); + + // Arbitrary frequency to cleanup/reuse command buffers + static constexpr uint32_t cleanup_frequency = 10; + + if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) { + ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool); + } + if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) { + ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool); + } +} + +vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset) { + return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) }; +} + +void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) { + VK_LOG_DEBUG("ggml_vk_sync_buffers()"); + + const bool transfer_queue = subctx->p->q->transfer_only; + + if (ctx) { + ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; + } + + subctx->s->buffer->buf.pipelineBarrier( + subctx->p->q->stage_flags, + subctx->p->q->stage_flags, + {}, + { { + { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, + { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) } + } }, + {}, + {} + ); +} + +static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) { + VK_LOG_DEBUG("ggml_vk_set_event()"); + + ctx->s->buffer->buf.resetEvent( + event, + ctx->p->q->stage_flags + ); +} + +void ggml_vk_set_event(vk_context& ctx, vk::Event& event) { + VK_LOG_DEBUG("ggml_vk_set_event()"); + + ctx->s->buffer->buf.setEvent( + event, + ctx->p->q->stage_flags + ); +} + +void ggml_vk_wait_events(vk_context& ctx, std::vector&& events) { + VK_LOG_DEBUG("ggml_vk_wait_events()"); + if (events.empty()) { + return; + } + + ctx->s->buffer->buf.waitEvents( + events, + ctx->p->q->stage_flags, + ctx->p->q->stage_flags, + {}, + {}, + {} + ); +} + +vk_subbuffer ggml_vk_tensor_subbuffer( + const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign) { + + vk_buffer buffer = nullptr; + size_t offset = 0; + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, tensor->data, buffer, offset); + } + if (!buffer) { + auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; + buffer = buf_ctx->dev_buffer; + offset = vk_tensor_offset(tensor) + tensor->view_offs; + } + GGML_ASSERT(buffer != nullptr); + + size_t size = ggml_nbytes(tensor); + + size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); + // The shader must support misaligned offsets when indexing into the buffer + GGML_ASSERT(allow_misalign || misalign_bytes == 0); + offset &= ~misalign_bytes; + size += misalign_bytes; + + return vk_subbuffer{buffer, offset, size}; +} + +static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) { + for (auto& cmd_buffer : pool.cmd_buffers) { + if (!cmd_buffer.in_use) { + cmd_buffer.use_counter++; + cmd_buffer.in_use = true; + return &cmd_buffer; + } + } + return ggml_vk_create_cmd_buffer(device, pool); +} + +static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) { + vk_submission s; + s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p); + if (one_time) { + s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); + } else { + s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} }); + } + + return s; +} + +void ggml_vk_ctx_end(vk_context& ctx) { + VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); + if (ctx->s == nullptr) { + return; + } + + ctx->s->buffer->buf.end(); + ctx->s = nullptr; +} + +void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { + VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")"); + if (subctx->s != nullptr) { + ggml_vk_ctx_end(subctx); + } + + subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) }); + subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); +} + +vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) { + vk_context result; + if (!ctx->compute_ctx.expired()) { + result = ctx->compute_ctx.lock(); + } else { + result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + + ctx->compute_ctx = result; + ggml_vk_ctx_begin(ctx->device, result); + } + + if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { + result->s->wait_semaphores.push_back(ctx->transfer_semaphore); + ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; + } + + return result; +} + +bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) { + if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) { + return false; + } + + vk_context cpy_ctx = ctx->transfer_ctx.lock(); + ggml_vk_ctx_end(cpy_ctx); + + for (auto& cpy : cpy_ctx->in_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + + ctx->transfer_semaphore.value++; + cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore); + + ggml_vk_submit(cpy_ctx, {}); + ctx->transfer_ctx.reset(); + return true; +} + +size_t ggml_vk_align_size(size_t width, size_t align) { + VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")"); + return CEIL_DIV(width, align) * align; +} + +void deferred_memcpy(void * dst, const void * src, size_t size, std::vector* memcpys) { + if (memcpys == nullptr) { + memcpy(dst, src, size); + } else { + memcpys->emplace_back(dst, src, size); + } +} + +void deferred_memset(void * dst, uint32_t val, size_t size, std::vector* memsets) { + if (memsets == nullptr) { + memset(dst, val, size); + } else { + memsets->emplace_back(dst, val, size); + } +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-push-constants.h b/ggml/src/ggml-vulkan/ggml-vulkan-push-constants.h new file mode 100644 index 0000000000..7d0dc642c9 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-push-constants.h @@ -0,0 +1,872 @@ +#pragma once +#include "ggml-vulkan-types.h" + +uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t); + +struct vk_mat_mat_push_constants { + uint32_t M; uint32_t N; uint32_t K; + uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t base_work_group_z; uint32_t num_batches; + uint32_t k_split; + uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; + uint32_t padded_N; +}; + +struct vk_mat_vec_push_constants { + uint32_t ncols; + uint32_t stride_a; + uint32_t stride_b; + uint32_t stride_d; + uint32_t batch_stride_a; + uint32_t batch_stride_b; + uint32_t batch_stride_d; + uint32_t fusion_flags; + uint32_t base_work_group_y; + uint32_t ne02; + uint32_t ne12; + uint32_t broadcast2; + uint32_t broadcast3; +}; + +struct vk_mat_vec_p021_push_constants { + uint32_t ncols_x; + uint32_t nrows_x; + uint32_t nchannels_x; + uint32_t nchannels_y; + uint32_t b_offset; + uint32_t d_offset; + uint32_t fusion_flags; +}; + +struct vk_mat_vec_nc_push_constants { + uint32_t ncols_x; + uint32_t nrows_x; + uint32_t row_stride_x; + uint32_t channel_stride_x; + uint32_t channel_stride_y; + uint32_t channel_x_divisor; + uint32_t ne12; + uint32_t b_offset; + uint32_t d_offset; + uint32_t nb03; + uint32_t nb13; + uint32_t nb23; + uint32_t fusion_flags; +}; + +struct vk_mat_mat_id_push_constants { + uint32_t M; uint32_t N; uint32_t K; + uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11; + uint32_t padded_N; +}; + +struct vk_mat_vec_id_push_constants { + uint32_t ncols; + uint32_t stride_a; + uint32_t stride_b; + uint32_t stride_d; + uint32_t batch_stride_a; + uint32_t batch_stride_b; + uint32_t batch_stride_d; + uint32_t fusion_flags; + uint32_t nei0; + uint32_t ne11; + uint32_t expert_i1; + uint32_t nbi1; +}; + +struct vk_flash_attn_push_constants { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + uint32_t nem2; + uint32_t nem3; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask_n_head_log2; + float m0; + float m1; + + uint32_t gqa_ratio; + uint32_t split_kv; + uint32_t k_num; +}; + +static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128"); + +struct vk_op_push_constants { + uint32_t KX; + uint32_t KY; + float param1; + float param2; + float param3; + float param4; +}; + +struct vk_op_fwht_push_constants { + uint32_t n_rows; + uint32_t src_offset; + uint32_t dst_offset; + float scale; +}; + +struct vk_op_count_experts_push_constants { + uint32_t ne00; + uint32_t ne01; + uint32_t nb00; + uint32_t nb01; + uint32_t a_offset; +}; + +struct vk_op_glu_push_constants { + uint32_t N; + uint32_t ne00; + uint32_t ne20; + uint32_t mode; // 0: default, 1: swapped, 2: split + float alpha; // for swiglu_oai + float limit; + uint32_t nb00; + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + uint32_t nb10; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + uint32_t nb20; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + uint32_t ne21; + uint32_t ne22; + uint32_t misalign_offsets; + uint32_t ne2_012mp; uint32_t ne2_012L; + uint32_t ne2_01mp; uint32_t ne2_01L; + uint32_t ne2_0mp; uint32_t ne2_0L; +}; + +static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128"); + +struct vk_op_unary_push_constants { + uint32_t ne; + uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t misalign_offsets; + float param1; float param2; float param3; float param4; + uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls; + uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls; +}; + +static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); + +static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) { + GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst))); + ne = ne != 0 ? ne : ggml_nelements(dst); + GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); + + vk_op_unary_push_constants p{}; + p.ne = (uint32_t)ne; + + size_t src0_tsize = ggml_type_size(src0->type); + p.ne00 = (uint32_t)src0->ne[0]; + p.ne01 = (uint32_t)src0->ne[1]; + p.ne02 = (uint32_t)src0->ne[2]; + p.ne03 = (uint32_t)src0->ne[3]; + p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); + p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); + p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); + p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); + + size_t dst_tsize = ggml_type_size(dst->type); + p.ne10 = (uint32_t)dst->ne[0]; + p.ne11 = (uint32_t)dst->ne[1]; + p.ne12 = (uint32_t)dst->ne[2]; + p.ne13 = (uint32_t)dst->ne[3]; + p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); + p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); + p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); + p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); + + return p; // offsets are initialized later in ggml_vk_op +} + +struct vk_op_pad_push_constants { + uint32_t ne; + uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t misalign_offsets; + uint32_t circular; + + uint32_t lp0; uint32_t rp0; + uint32_t lp1; uint32_t rp1; + uint32_t lp2; uint32_t rp2; + uint32_t lp3; uint32_t rp3; +}; + +static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst) { + int64_t ne = ggml_nelements(dst); + GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); + + vk_op_pad_push_constants p{}; + p.ne = (uint32_t)ne; + + size_t src0_tsize = ggml_type_size(src0->type); + p.ne00 = (uint32_t)src0->ne[0]; + p.ne01 = (uint32_t)src0->ne[1]; + p.ne02 = (uint32_t)src0->ne[2]; + p.ne03 = (uint32_t)src0->ne[3]; + p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); + p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); + p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); + p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); + + size_t dst_tsize = ggml_type_size(dst->type); + p.ne10 = (uint32_t)dst->ne[0]; + p.ne11 = (uint32_t)dst->ne[1]; + p.ne12 = (uint32_t)dst->ne[2]; + p.ne13 = (uint32_t)dst->ne[3]; + p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); + p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); + p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); + p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); + + p.lp0 = dst->op_params[0]; + p.rp0 = dst->op_params[1]; + p.lp1 = dst->op_params[2]; + p.rp1 = dst->op_params[3]; + p.lp2 = dst->op_params[4]; + p.rp2 = dst->op_params[5]; + p.lp3 = dst->op_params[6]; + p.rp3 = dst->op_params[7]; + p.circular = dst->op_params[8]; + + return p; // fastdiv values and offsets are initialized later in ggml_vk_op +} + +static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) +{ + // compute L = ceil(log2(d)); + L = 0; + while (L < 32 && (uint32_t{1} << L) < d) { + L++; + } + + mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); +} + +static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) { + return L0 | (L1 << 8) | (L2 << 16); +} + +template void init_pushconst_fastdiv(T &p) { + GGML_UNUSED(p); + static_assert(!std::is_const::value, "unexpected type"); +} + +template <> inline void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { + // Compute magic values to divide by these six numbers. + uint32_t ne0_012L; + uint32_t ne0_01L; + uint32_t ne0_0L; + uint32_t ne1_012L; + uint32_t ne1_01L; + uint32_t ne1_0L; + + init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L); + + p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L); + p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L); +} + +template <> inline void init_pushconst_fastdiv(vk_op_glu_push_constants &p) { + // GLU linearizes over dst, then uses dst coordinates for src0/src1. + init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L); + init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L); + init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L); +} + +struct vk_op_binary_push_constants { + uint32_t ne; + uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; + uint32_t misalign_offsets; + float param1; float param2; int32_t param3; +}; + +struct vk_op_multi_add_push_constants { + // shape for dst + uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; + + // strides for srcs+dst + uint32_t nb[MAX_PARAMETER_COUNT][4]; + + uint32_t rms_partials; +}; + +static_assert(MAX_PARAMETER_COUNT == 12); + +static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); + +struct vk_op_topk_moe_push_constants { + uint32_t n_rows; + uint32_t n_experts_push; + uint32_t n_expert_used; + float clamp_min; + float clamp_max; + uint32_t gating_func; + uint32_t has_bias; + uint32_t with_norm; + float output_scale; + float output_bias; +}; + +struct vk_op_add_id_push_constants { + uint32_t ne0; + uint32_t ne1; + uint32_t s01; + uint32_t s02; + uint32_t s11; + uint32_t s21; +}; + +struct vk_op_diag_mask_push_constants { + uint32_t ncols; + uint32_t rows_per_channel; + int32_t n_past; +}; + +struct vk_op_rope_push_constants { + uint32_t rope_mode; + uint32_t nrows; + uint32_t n_dims; + float freq_scale; + float freq_base; + float ext_factor; + float attn_factor; + float corr_dims[2]; + float theta_scale; + uint32_t has_ff; + int32_t sections[4]; + uint32_t is_imrope; + uint32_t is_back; + uint32_t set_rows_stride; + uint32_t ne00; + uint32_t ne01; + uint32_t ne02; + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + uint32_t a_offset; + uint32_t d_offset; +}; + +static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128"); + +struct vk_op_rms_norm_mul_rope_push_constants { + vk_op_binary_push_constants bin; + vk_op_rope_push_constants rope; +}; + +struct vk_op_soft_max_push_constants { + uint32_t KX; + uint32_t KY; + uint32_t ne00; + uint32_t ne01; + uint32_t ne02; + uint32_t ne12; + uint32_t ne13; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + float scale; + float max_bias; + float m0; + float m1; + uint32_t n_head_log2; + uint32_t nrows_x; + uint32_t has_sinks; +}; + +struct vk_op_argsort_push_constants { + uint32_t ncols; + uint32_t ncols_padded; + uint32_t ncols_padded_log2; + uint32_t nrows; + uint32_t order; + uint32_t outer_start; + uint32_t outer_end; + uint32_t inner_start; + uint32_t inner_end; +}; + +struct vk_op_topk_push_constants { + uint32_t orig_ncols; + uint32_t ncols_input; + uint32_t ncols_output; + uint32_t k; + uint32_t nrows; + uint32_t first_pass; + uint32_t last_pass; +}; + +struct vk_op_im2col_push_constants { + uint64_t dst_addr; + uint32_t batch_offset; uint32_t offset_delta; + uint32_t IC; + uint32_t IW; uint32_t IH; + uint32_t OW; uint32_t OH; + uint32_t KW; uint32_t KH; + uint32_t OH_batch; + uint32_t CHW; + int32_t s0; int32_t s1; + int32_t p0; int32_t p1; + int32_t d0; int32_t d1; + uint32_t batch_IC; +}; + +struct vk_op_im2col_3d_push_constants { + uint64_t dst_addr; + uint32_t nb10; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + uint32_t s0; + uint32_t s1; + uint32_t s2; + uint32_t p0; + uint32_t p1; + uint32_t p2; + uint32_t d0; + uint32_t d1; + uint32_t d2; + uint32_t IW; + uint32_t IH; + uint32_t ID; + uint32_t IC; + uint32_t KW; + uint32_t OH; + uint32_t KD_KH_KW; + uint32_t KH_KW; + uint32_t IC_KD_KH_KW; + uint32_t N_OD_OH; + uint32_t OD_OH; + uint32_t OD_OH_OW_IC_KD_KH_KW; + uint32_t OH_OW_IC_KD_KH_KW; + uint32_t OW_IC_KD_KH_KW; + uint32_t misalign_offsets; +}; + +struct vk_op_timestep_embedding_push_constants { + uint32_t nb1; + uint32_t dim; + uint32_t max_period; +}; + +struct vk_op_col2im_1d_push_constants { + uint32_t T_out; + uint32_t OC; + uint32_t K_OC; + uint32_t T_in; + uint32_t K; + int32_t stride; + int32_t p0; +}; + +struct vk_op_conv_transpose_1d_push_constants { + uint32_t Cout; + uint32_t Cin; + uint32_t K; + uint32_t L; + uint32_t KL; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb11; + uint32_t nb1; + + int32_t s0; +}; + +struct vk_op_snake_push_constants { + uint32_t ne0; + uint32_t ne1; +}; + +struct vk_op_pool2d_push_constants { + uint32_t IW; uint32_t IH; + uint32_t OW; uint32_t OH; + uint32_t OC; + uint32_t pelements; + uint32_t op; + int32_t k0; int32_t k1; + int32_t s0; int32_t s1; + int32_t p0; int32_t p1; +}; + +struct vk_op_rwkv_wkv6_push_constants { + uint32_t B; + uint32_t T; + uint32_t C; + uint32_t H; +}; + +struct vk_op_rwkv_wkv7_push_constants { + uint32_t B; + uint32_t T; + uint32_t C; + uint32_t H; +}; + +struct vk_op_gated_delta_net_push_constants { + uint32_t H; + uint32_t n_tokens; + uint32_t n_seqs; + uint32_t s_off; + uint32_t sq1, sq2, sq3; + uint32_t sv1, sv2, sv3; + uint32_t sb1, sb2, sb3; + uint32_t neq1, rq3; + float scale; + uint32_t K; +}; + +struct vk_op_ssm_scan_push_constants { + uint32_t nb02, nb03, nb12, nb13; + uint32_t nb21, nb22, nb31; + uint32_t nb42, nb43, nb52, nb53; + uint32_t s_off; + uint32_t n_head, d_head, n_group, n_tok; +}; + +struct vk_op_ssm_conv_push_constants { + uint32_t nb01, nb02; + uint32_t nb11; + uint32_t dst_nb0, dst_nb1, dst_nb2; + uint32_t nc, ncs, nr, n_t, n_s; +}; + +struct vk_op_conv2d_push_constants { + uint32_t Cout; + uint32_t Cin; + uint32_t N; + + uint32_t W; + uint32_t H; + uint32_t OW; + uint32_t OH; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; + + // init_fastdiv_values constants for dividing by OW, OW*OH + uint32_t OWmp; uint32_t OWL; + uint32_t OWOHmp; uint32_t OWOHL; +}; + +template <> inline void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { + // Compute magic values to divide by OW, OW*OH + init_fastdiv_values(p.OW, p.OWmp, p.OWL); + init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL); +} + +struct vk_op_conv2d_dw_push_constants { + uint32_t ne; + uint32_t batches; + uint32_t channels; + uint32_t dst_w; + uint32_t dst_h; + uint32_t src_w; + uint32_t src_h; + uint32_t knl_w; + uint32_t knl_h; + int32_t stride_x; + int32_t stride_y; + int32_t pad_x; + int32_t pad_y; + int32_t dilation_x; + int32_t dilation_y; +}; + +struct vk_op_upscale_push_constants { + uint32_t ne; uint32_t a_offset; uint32_t d_offset; + uint32_t ne00; uint32_t ne01; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; + float sf0; float sf1; float sf2; float sf3; + float pixel_offset; +}; + +struct vk_op_sum_rows_push_constants +{ + uint32_t n_cols; + uint32_t ne01, ne02; + uint32_t nb01, nb02, nb03; + uint32_t nb11, nb12, nb13; + float weight; + uint32_t misalign_offsets; + uint32_t ne0_12mp, ne0_12L; + uint32_t ne0_1mp, ne0_1L; +}; + +static vk_op_sum_rows_push_constants vk_op_sum_rows_push_constants_init(const ggml_tensor * src, const ggml_tensor * dst, int64_t n_cols) { + uint32_t type_size = (uint32_t)ggml_type_size(src->type); + vk_op_sum_rows_push_constants p = {}; + p.n_cols = (uint32_t)n_cols; + p.ne01 = (uint32_t)src->ne[1]; + p.ne02 = (uint32_t)src->ne[2]; + p.nb01 = (uint32_t)src->nb[1] / type_size; + p.nb02 = (uint32_t)src->nb[2] / type_size; + p.nb03 = (uint32_t)src->nb[3] / type_size; + p.nb11 = (uint32_t)dst->nb[1] / type_size; + p.nb12 = (uint32_t)dst->nb[2] / type_size; + p.nb13 = (uint32_t)dst->nb[3] / type_size; + p.weight = 1.0f; + return p; +} + +template <> inline void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) { + init_fastdiv_values(p.ne01*p.ne02, p.ne0_12mp, p.ne0_12L); + init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L); +} + +struct vk_quantize_q8_1_push_constants { + uint32_t ne; + uint32_t num_blocks; +}; + +struct vk_op_flash_attn_split_k_reduce_push_constants { + uint32_t D; + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + uint32_t k_num; + uint32_t sinks; +}; + +struct vk_op_flash_attn_mask_opt_push_constants { + uint32_t nem0; + uint32_t nem1; + uint32_t nem2; + uint32_t nbm1; + uint32_t nbm2; + uint32_t nbm3; + uint32_t nbd1; + uint32_t nbd2; + uint32_t nbd3; +}; + +template void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + GGML_UNUSED(p); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); + GGML_UNUSED(dst); + static_assert(!std::is_const::value, "unexpected type"); + GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0); + GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0); + GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0); + GGML_ASSERT(!src3 || get_misalign_bytes(ctx, src3) == 0); + GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_p021_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.b_offset = b_offset; + p.d_offset = d_offset; + + GGML_UNUSED(src0); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_nc_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.b_offset = b_offset; + p.d_offset = d_offset; + + GGML_UNUSED(src0); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + p.dst_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template size_t push_constant_size(const T &t) { + static_assert(std::is_class::value, "T must be a struct/class"); + GGML_UNUSED(t); + return sizeof(T); +} + +template size_t push_constant_size(const std::vector &t) { + GGML_UNUSED(t); + return sizeof(T) * t.size(); +} + +template size_t push_constant_size(const std::array &t) { + GGML_UNUSED(t); + return sizeof(T) * N; +} + +template const T *push_constant_data(const T &t) { + static_assert(std::is_class::value, "T must be a struct/class"); + return &t; +} + +template const T *push_constant_data(const std::vector &t) { + return t.data(); +} + +template const T *push_constant_data(const std::array &t) { + return t.data(); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.misalign_offsets = (a_offset << 16) | d_offset; + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset; + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_ASSERT(a_offset < (1u << 8)); + GGML_ASSERT(b_offset < (1u << 8)); + GGML_ASSERT(d_offset < (1u << 8)); + + p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; + + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.misalign_offsets = (a_offset << 16) | d_offset; + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.misalign_offsets = (a_offset << 16) | d_offset; + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.misalign_offsets = (a_offset << 16) | d_offset; + + GGML_UNUSED(src0); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0)); + + p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; + + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.a_offset = a_offset; + p.d_offset = d_offset; + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + +template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp new file mode 100644 index 0000000000..5be2eccd1e --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp @@ -0,0 +1,2704 @@ +#include "ggml-vulkan-common.h" + +static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count, std::vector & out) { + static const char kDecodeVectorExt[] = "SPV_NV_cooperative_matrix_decode_vector"; + + if (word_count < 5) { + return false; + } + + bool uses_decode_vector = false; + for (size_t pos = 5; pos < word_count; ) { + uint32_t word = code[pos]; + uint32_t wc = word >> spv::WordCountShift; + uint32_t op = word & spv::OpCodeMask; + GGML_ASSERT(wc > 0 && pos + wc <= word_count); + if (op == spv::OpExtension && wc >= 2) { + const char * s = reinterpret_cast(&code[pos + 1]); + if (strcmp(s, kDecodeVectorExt) == 0) { + uses_decode_vector = true; + break; + } + } + pos += wc; + } + + if (!uses_decode_vector) { + return false; + } + + VK_LOG_DEBUG("ggml_vk_strip_decode_vector: stripping SPV_NV_cooperative_matrix_decode_vector"); + + // Bulk-copy unchanged runs and only break the run when an instruction needs to + // be dropped or patched. Use reserve + insert/push_back so the destination buffer + // is touched exactly once (no zero-initialization pass from resize()). + out.clear(); + out.reserve(word_count); + + size_t run_start = 0; + auto flush_run = [&](size_t up_to) { + if (up_to > run_start) { + out.insert(out.end(), code + run_start, code + up_to); + } + }; + + for (size_t pos = 5; pos < word_count; ) { + uint32_t word = code[pos]; + uint32_t wc = word >> spv::WordCountShift; + uint32_t op = word & spv::OpCodeMask; + GGML_ASSERT(wc > 0 && pos + wc <= word_count); + + if (op == spv::OpExtension && wc >= 2) { + const char * s = reinterpret_cast(&code[pos + 1]); + if (strcmp(s, kDecodeVectorExt) == 0) { + flush_run(pos); + pos += wc; + run_start = pos; + continue; + } + } + + if (op == spv::OpCapability && wc == 2 && code[pos + 1] == kSpvCapabilityCooperativeMatrixDecodeVectorNV) { + flush_run(pos); + pos += wc; + run_start = pos; + continue; + } + + if (op == kSpvOpCooperativeMatrixLoadTensorNV) { + // [opcode/wc][ResultType][Result][Pointer][Object][TensorLayout][MemOperand mask][mem extras...][TA mask][ta extras...] + GGML_ASSERT(wc >= 8); + + uint32_t mem_mask = code[pos + 6]; + size_t cur = pos + 7; + // Each of these MemoryAccess bits (when set) carries one trailing operand. + cur += (mem_mask & 0x2) ? 1 : 0; // Aligned + cur += (mem_mask & 0x8) ? 1 : 0; // MakePointerAvailable + cur += (mem_mask & 0x10) ? 1 : 0; // MakePointerVisible + cur += (mem_mask & 0x10000) ? 1 : 0; // AliasScopeINTELMask + cur += (mem_mask & 0x20000) ? 1 : 0; // NoAliasINTELMask + GGML_ASSERT(cur < pos + wc); + + uint32_t ta_mask = code[cur]; + if ((ta_mask & kSpvTensorAddressingDecodeVectorFuncBit) == 0) { + pos += wc; + continue; // leave instruction inside the current unchanged run + } + + flush_run(pos); + + // Append unchanged prefix of the instruction (header through the mem-extras). + size_t inst_start = out.size(); + size_t pre_n = cur - pos; + out.insert(out.end(), code + pos, code + pos + pre_n); + + // Emit TA mask with the DecodeVectorFunc bit cleared. + out.push_back(ta_mask & ~kSpvTensorAddressingDecodeVectorFuncBit); + + // TA extras: TensorView (0x1) and DecodeFunc (0x2) are kept verbatim; + // DecodeVectorFunc (0x4) is dropped along with its trailing id operand. + size_t keep_ta_extras = ((ta_mask & 0x1) ? 1 : 0) + ((ta_mask & 0x2) ? 1 : 0); + if (keep_ta_extras) { + out.insert(out.end(), code + cur + 1, code + cur + 1 + keep_ta_extras); + } + + GGML_ASSERT(wc == pre_n + 1 + keep_ta_extras + 1); + + // Patch the instruction header with the new (one-shorter) word count. + uint32_t new_wc = wc - 1; + out[inst_start] = (new_wc << spv::WordCountShift) | op; + + pos += wc; + run_start = pos; + continue; + } + + pos += wc; + } + + flush_run(word_count); + return true; +} + +static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint, + uint32_t parameter_count, std::array wg_denoms, std::vector specialization_constants, + bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) { + VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << pipeline->name << ", " << entrypoint << ", " << parameter_count << + ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << + disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")"); + GGML_ASSERT(parameter_count > 0); + GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT); + GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT + + vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); + + // Patch SPIR-V to enable RTE rounding for FP16, avoiding the need for + // separate shader variants compiled with -DRTE16. + std::vector spirv; + if (device->float_controls_rte_fp16) { + const uint32_t* spv_words = reinterpret_cast(spv_data); + size_t word_count = spv_size / sizeof(uint32_t); + spirv.assign(spv_words, spv_words + word_count); + + // Find insertion points respecting SPIR-V layout order: + // Header(5) -> OpCapability -> OpExtension -> ... -> OpEntryPoint -> OpExecutionMode -> ... + size_t pos = 5; // skip header + size_t cap_insert_pos = pos; + size_t ext_insert_pos = pos; + size_t exec_insert_pos = pos; + uint32_t entry_point_id = 0; + + while (pos < spirv.size()) { + uint32_t opcode = spirv[pos] & spv::OpCodeMask; + uint32_t len = spirv[pos] >> spv::WordCountShift; + if (len == 0) break; + + if (opcode == spv::OpCapability) { + cap_insert_pos = pos + len; + ext_insert_pos = pos + len; + } else if (opcode == spv::OpExtension) { + ext_insert_pos = pos + len; + } else if (opcode == spv::OpEntryPoint) { + entry_point_id = spirv[pos + 2]; + exec_insert_pos = pos + len; + } else if (opcode == spv::OpExecutionMode || opcode == spv::OpExecutionModeId) { + exec_insert_pos = pos + len; + } else if (entry_point_id != 0) { + break; + } + + pos += len; + } + + // Insert from latest position first so earlier indices stay valid. + + // OpExecutionMode %entrypoint RoundingModeRTE 16 + uint32_t exec_mode[] = { (4u << spv::WordCountShift) | spv::OpExecutionMode, entry_point_id, spv::ExecutionModeRoundingModeRTE, 16 }; + spirv.insert(spirv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode)); + + // OpExtension "SPV_KHR_float_controls" + const char ext_str[] = "SPV_KHR_float_controls"; + size_t ext_str_words = CEIL_DIV(sizeof(ext_str), sizeof(uint32_t)); + std::vector extension(1 + ext_str_words, 0); + extension[0] = (uint32_t)((1 + ext_str_words) << spv::WordCountShift) | spv::OpExtension; + memcpy(&extension[1], ext_str, sizeof(ext_str)); + spirv.insert(spirv.begin() + ext_insert_pos, extension.begin(), extension.end()); + + // OpCapability RoundingModeRTE + uint32_t capability[] = { (2u << spv::WordCountShift) | spv::OpCapability, spv::CapabilityRoundingModeRTE }; + spirv.insert(spirv.begin() + cap_insert_pos, std::begin(capability), std::end(capability)); + + shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data()); + } + +#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + if (device->coopmat2 && !device->coopmat2_decode_vector) { + const uint32_t * src = spirv.empty() ? reinterpret_cast(spv_data) : spirv.data(); + size_t src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size(); + std::vector stripped; + if (ggml_vk_strip_decode_vector(src, src_n, stripped)) { + spirv = std::move(stripped); + shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data()); + } + } +#endif + + pipeline->shader_module = device->device.createShaderModule(shader_module_create_info); + + vk::PushConstantRange pcr( + vk::ShaderStageFlagBits::eCompute, + 0, + pipeline->push_constant_size + ); + + vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr); + pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info); + + std::vector specialization_entries(specialization_constants.size()); + + for (size_t i = 0; i < specialization_constants.size(); i++) { + specialization_entries[i].constantID = i; + specialization_entries[i].offset = i * sizeof(uint32_t); + specialization_entries[i].size = sizeof(uint32_t); + } + + vk::SpecializationInfo specialization_info( + specialization_entries.size(), + specialization_entries.data(), + specialization_constants.size() * sizeof(uint32_t), + specialization_constants.data() + ); + + vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{}; + + if (device->subgroup_require_full_support && require_full_subgroups) { + pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT; + } + + vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( + pipeline_shader_stage_create_flags, + vk::ShaderStageFlagBits::eCompute, + pipeline->shader_module, + entrypoint.c_str(), + &specialization_info); + + vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info; + pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size; + if (device->subgroup_size_control && required_subgroup_size > 0) { + GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size); + pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info); + } + + vk::ComputePipelineCreateInfo compute_pipeline_create_info( + device->pipeline_executable_properties_support ? + vk::PipelineCreateFlagBits::eCaptureStatisticsKHR : + vk::PipelineCreateFlags{}, + pipeline_shader_create_info, + pipeline->layout); + + vk::PipelineRobustnessCreateInfoEXT rci; + + if (device->pipeline_robustness && disable_robustness) { + rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + compute_pipeline_create_info.setPNext(&rci); + } + +#if defined(VK_EXT_shader_64bit_indexing) + vk::PipelineCreateFlags2CreateInfo pipelineFlags2CreateInfo; + if (pipeline->is_64b_indexing) + { + pipelineFlags2CreateInfo.flags = vk::PipelineCreateFlagBits2::e64BitIndexingEXT; + if (device->pipeline_executable_properties_support) { + pipelineFlags2CreateInfo.flags |= vk::PipelineCreateFlagBits2::eCaptureStatisticsKHR; + } + pipelineFlags2CreateInfo.setPNext(compute_pipeline_create_info.pNext); + compute_pipeline_create_info.setPNext(&pipelineFlags2CreateInfo); + } +#endif + + try { + pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; + } catch (const vk::SystemError& e) { + std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; + } + + if (vk_instance.debug_utils_support) { + vk::DebugUtilsObjectNameInfoEXT duoni; + duoni.objectType = vk::ObjectType::ePipeline; + duoni.pObjectName = pipeline->name.c_str(); + duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast(pipeline->pipeline)); + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); + } + + if (device->pipeline_executable_properties_support) { + vk::PipelineExecutableInfoKHR executableInfo; + executableInfo.pipeline = pipeline->pipeline; + + auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo); + + bool print_stats = !vk_pipeline_stats_filter.empty() && + pipeline->name.find(vk_pipeline_stats_filter) != std::string::npos; + if (print_stats) { + std::cerr << "ggml_vulkan: pipeline stats for " << pipeline->name << ":" << std::endl; + } + + for (auto & s : statistics) { + if (print_stats) { + std::cerr << "ggml_vulkan: " << s.name.data() << ": "; + switch (s.format) { + case vk::PipelineExecutableStatisticFormatKHR::eBool32: + std::cerr << (s.value.b32 ? "true" : "false"); + break; + case vk::PipelineExecutableStatisticFormatKHR::eInt64: + std::cerr << s.value.i64; + break; + case vk::PipelineExecutableStatisticFormatKHR::eUint64: + std::cerr << s.value.u64; + break; + case vk::PipelineExecutableStatisticFormatKHR::eFloat64: + std::cerr << s.value.f64; + break; + } + std::cerr << std::endl; + } + // "Register Count" is reported by NVIDIA drivers. + if (strcmp(s.name, "Register Count") == 0) { + VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64 << " registers"); + pipeline->register_count = (uint32_t)s.value.u64; + } + } + } + + { + std::lock_guard guard(device->compile_mutex); + device->all_pipelines.push_back(pipeline); + pipeline->compiled = true; + pipeline->compile_pending = false; + } + device->compile_cv.notify_all(); +} + +void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { + VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")"); + device.destroyPipelineLayout(pipeline->layout); + + device.destroyShaderModule(pipeline->shader_module); + + device.destroyPipeline(pipeline->pipeline); +} + +static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { + + vk_fa_tuning_params result{}; + result.path = FA_SCALAR; + + if (device->vendor_id == VK_VENDOR_ID_INTEL) { + // Disable subgroup use due to performance issues when enforcing subgroup sizes + result.subgroup_size = 32; + result.disable_subgroups = true; + } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN) { + result.subgroup_size = n_rows < 4 ? 32 : device->subgroup_size; + } else { + result.subgroup_size = device->subgroup_size; + } + + // Row split splits the workgroup so that synchronization only has to happen within subgroups, which avoids barriers + uint32_t row_split_max_hsk = 64; + if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !device->uma) { + row_split_max_hsk = n_rows <= 8 ? 64 : 128; + } + result.row_split = (n_rows < 4 || hsk <= row_split_max_hsk) ? 1 : 4; + + if (result.subgroup_size > 32 && (n_rows < 4 || hsk < (result.row_split == 1 ? 128 : 64))) { + result.workgroup_size = result.subgroup_size * 2; + } else { + result.workgroup_size = result.subgroup_size * 4; + } + + const uint32_t D = hsk | hsv; + + const bool reduce_block_rows = D & 8 || n_kv < 1024 || device->vendor_id == VK_VENDOR_ID_INTEL; + + if (n_rows == 1) { + result.block_rows = 1; + result.block_cols = 64; + } else { + // row_split 1 means higher register use per row, so block size has to be adjusted + if (result.row_split == 1) { + result.block_rows = n_rows == 2 ? 2 : ((n_rows <= 4 || reduce_block_rows) ? 4 : 8); + } else { + result.block_rows = n_rows <= 4 ? 4 : ((n_rows <= 8 || reduce_block_rows) ? 8 : 16); + } + + result.block_cols = (D & 8) ? 64 : 32; + } + + const uint32_t D_lsb = D ^ (D & (D-1)); // extract lowest set bit + + result.d_split = std::min(std::min(result.subgroup_size, 8u), D_lsb / 4); + + result.shmem_staging = (device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256) ? 1 : 0; + + if (!reduce_block_rows && !ggml_vk_flash_attn_scalar_shmem_support(device, result, hsk, hsv, f32acc, k_type, v_type)) { + result.block_rows /= 2; + } + + // On AMD RDNA, for small head sizes and big batch size the shader uses few registers, so too many subgroups get scheduled + // at once and end up thrashing the cache. Fix this by setting a large (unused) shmem buffer that reduces occupancy. + // This targets an occupancy of 4 subgroups per SIMD. + if (device->vendor_id == VK_VENDOR_ID_AMD && device->properties.limits.maxComputeSharedMemorySize == 65536) { + if (device->architecture != AMD_GCN && n_rows >= 64 && hsk <= 128) { + // 30kb target for hsk > 64, 26kb for <= 64 due to smaller workgroup size + // Values are guessed, tested on RDNA2 + result.limit_occupancy_shmem = (hsk <= 64 ? 26 : 30) * 1024 / 4 / 4; + } else if (device->architecture == AMD_GCN && n_rows <= 8 && hsk >= 256) { + // Same thing for GCN, with an occupancy target of 2 subgroups per SIMD. + // Here low-batch FA with large head size is affected. + // n_rows < 4 switch because workgroup size switches from 128 to 256 there. + result.limit_occupancy_shmem = (n_rows < 4 ? 14 : 26) * 1024 / 4 / 4; + } + } + + return result; +} + +static vk_fa_tuning_params get_fa_tuning_params_coopmat1(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { + GGML_UNUSED(n_rows); + GGML_UNUSED(n_kv); + GGML_UNUSED(k_type); + GGML_UNUSED(v_type); + GGML_UNUSED(f32acc); + + vk_fa_tuning_params result{}; + result.path = FA_COOPMAT1; + + const uint32_t D = hsk | hsv; + + const uint32_t coopmat_block_rows = 16; + const uint32_t coopmat_block_cols = 16; + + const uint32_t num_subgroups = 4; + + result.block_rows = coopmat_block_rows; + result.block_cols = coopmat_block_cols * num_subgroups; + result.row_split = num_subgroups; + result.subgroup_size = device->subgroup_size; + result.workgroup_size = num_subgroups * result.subgroup_size; + + const uint32_t D_lsb = D ^ (D & (D-1)); // extract lowest set bit + result.d_split = std::min(std::min(result.subgroup_size, 8u), D_lsb / 4); + + result.shmem_staging = (device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256) ? 1 : 0; + + return result; +} + +static vk_fa_tuning_params get_fa_tuning_params_coopmat2(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { + GGML_UNUSED(n_kv); + GGML_UNUSED(f32acc); + + vk_fa_tuning_params result{}; + result.path = FA_COOPMAT2; + + const uint32_t D = hsk | hsv; + + const bool small_rows = n_rows < 32; + + if (small_rows) { + result.block_rows = 32; + result.block_cols = 32; + } else if (ggml_is_quantized(k_type) || ggml_is_quantized(v_type) || hsk >= 256 || hsv >= 256) { + result.block_rows = (hsk >= 512 || hsv >= 512) ? 32 : 64; + result.block_cols = 32; + } else { + result.block_rows = 64; + result.block_cols = 64; + } + + result.subgroup_size = device->subgroup_size; + result.workgroup_size = (small_rows && (D % 32) == 0) ? 256 : 128; + + return result; +} + +vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { + FaCodePath path = device->coopmat2 ? FA_COOPMAT2 : + device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR; + + if (path == FA_COOPMAT2 && k_type == GGML_TYPE_BF16 && !device->coopmat2_bf16_support) { + path = FA_COOPMAT1; + } + if (path == FA_COOPMAT1 && k_type == GGML_TYPE_BF16 && !device->coopmat_bf16_support) { + path = FA_SCALAR; + } + + if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) { + // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090 + path = FA_SCALAR; + } + + if (path == FA_COOPMAT1) { + bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) || + (!f32acc && device->coopmat_support_16x16x16_f16acc); + const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); + bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc, k_type); + + if (!shape_ok || !shmem_ok) { + path = FA_SCALAR; + } + } + + // scalar is faster than coopmat when N==1 + if (n_rows == 1 && (path == FA_COOPMAT1 || path == FA_COOPMAT2)) { + path = FA_SCALAR; + } + + // Q1_0 K/V is only implemented on coopmat2 (flash_attn_cm2); there is no scalar FA shader for it. + if ((k_type == GGML_TYPE_Q1_0 || v_type == GGML_TYPE_Q1_0) && device->coopmat2) { + path = FA_COOPMAT2; + } + + switch (path) { + case FA_SCALAR: + return get_fa_tuning_params_scalar(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); + case FA_COOPMAT1: + return get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); + case FA_COOPMAT2: + return get_fa_tuning_params_coopmat2(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); + default: + throw std::runtime_error("unsupported FaCodePath"); + } +} + +vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, + bool use_mask, bool use_mask_opt, bool use_logit_softcap, ggml_type k_type, ggml_type v_type) { + const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary && + (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2); + + uint32_t flags = (use_mask_opt ? 1 : 0) | + (use_mask ? 2 : 0) | + (use_logit_softcap ? 4 : 0) | + (old_amd_windows ? 8 : 0); + + const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size; + + return vk_fa_pipeline_state{hsk, hsv, params.block_rows, params.block_cols, params.d_split, params.row_split, params.shmem_staging, params.path, params.workgroup_size, subgroup_size, aligned, f32acc, flags, params.limit_occupancy_shmem, k_type, v_type}; +} + +static std::vector get_fa_spec_constants(const vk_fa_pipeline_state& state) { + const auto fa_block_bytes = [](ggml_type t) -> uint32_t { + if (t == GGML_TYPE_F32) return 16u; + return (uint32_t) ggml_type_size(t); + }; + return { + /* 0 WorkGroupSize */ state.workgroup_size, + /* 1 Br */ state.Br, + /* 2 Bc */ state.Bc, + /* 3 HSK */ state.HSK, + /* 4 HSV */ state.HSV, + /* 5 Clamp */ static_cast(!state.aligned), + /* 6 D_split */ state.D_split, + /* 7 row_split */ state.row_split, + /* 8 SubGroupSize */ state.subgroup_size, + /* 9 SHMEM_STAGING */ state.shmem_staging ? 1u : 0u, + /*10 Flags */ state.flags, + /*11 LIMIT_OCCUPANCY_SHMEM */ state.limit_occupancy_shmem, + /*12 FaTypeK */ static_cast(state.k_type), + /*13 FaTypeV */ static_cast(state.v_type), + /*14 FaBlockBytesK */ fa_block_bytes(state.k_type), + /*15 FaBlockBytesV */ fa_block_bytes(state.v_type), + }; +} + +static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { + + uint32_t lut_size = 0; + switch (src0_type) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + // Regular matmul uses the compact uint16_t IQ1 grid; the expanded + // uint32_t grid is only enabled for the q8_1/int-dot vector path. + lut_size = 2*2048; + break; + case GGML_TYPE_IQ2_XXS: + lut_size = 8*256; + break; + case GGML_TYPE_IQ2_XS: + lut_size = 8*512; + break; + case GGML_TYPE_IQ2_S: + lut_size = 8*1024; + break; + case GGML_TYPE_IQ3_XXS: + lut_size = 4*256; + break; + case GGML_TYPE_IQ3_S: + lut_size = 4*512; + break; + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_MXFP4: + lut_size = 4*16; + break; + case GGML_TYPE_NVFP4: + // Same kvalues budget as MXFP4 plus ue4m3_fp32_lut[128] (types.glsl, DATA_A_NVFP4). + lut_size = 4*16 + 128u * (uint32_t)sizeof(float); + break; + default: + break; + } + + // Needs to be kept up to date on shader changes + const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1; + const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); + const uint32_t warps = warptile[0] / warptile[10]; + + const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; + const uint32_t mmid_row_ids = mul_mat_id ? (warptile[2] * 2 * sizeof(uint16_t)) : 0; + const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; + const uint32_t ballots_sh = mul_mat_id ? (warps * 4 * sizeof(uint32_t)) : 0; + + const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size + ballots_sh; + const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; + + VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), " + "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", supported=" << supported); + + return supported; +} + +static bool ggml_vk_matmul_int_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { + + // FLOAT_TYPE in the shader is float16_t with fp16 support, otherwise float. + const uint32_t fp_size = device->fp16 ? 2u : 4u; + const uint32_t fp_align = fp_size; + const uint32_t fp2_size = 2u * fp_size; + const uint32_t fp2_align = device->fp16 ? 4u : 8u; + + struct member { uint32_t size, align; }; + auto std430_size = [](std::initializer_list members) { + uint32_t off = 0, struct_align = 1; + for (const auto &m : members) { + off = (off + m.align - 1) & ~(m.align - 1); + off += m.size; + struct_align = std::max(struct_align, m.align); + } + return (off + struct_align - 1) & ~(struct_align - 1); + }; + + uint32_t block_a_size = 0; + switch (src0_type) { + case GGML_TYPE_Q4_0: block_a_size = std430_size({{16, 4}, {fp_size, fp_align}}); break; // qs[16/4] + dm + case GGML_TYPE_Q4_1: block_a_size = std430_size({{16, 4}, {fp2_size, fp2_align}}); break; // qs[16/4] + dm(vec2) + case GGML_TYPE_Q5_0: block_a_size = std430_size({{16, 4}, {4, 4}, {fp_size, fp_align}}); break; // qs[16/4] + qh + dm + case GGML_TYPE_Q5_1: block_a_size = std430_size({{16, 4}, {4, 4}, {fp2_size, fp2_align}}); break; // qs[16/4] + qh + dm(vec2) + case GGML_TYPE_Q8_0: block_a_size = std430_size({{32, 4}, {fp_size, fp_align}}); break; // qs[8] + dm + case GGML_TYPE_MXFP4: block_a_size = std430_size({{32, 4}, {fp_size, fp_align}}); break; // qs[8] + d + case GGML_TYPE_Q2_K: block_a_size = std430_size({{ 8, 4}, {2, 2}, {fp2_size, fp2_align}}); break; // qs[2] + scales(u8vec2) + dm(vec2) + case GGML_TYPE_Q3_K: block_a_size = std430_size({{16, 4}, {fp2_size, fp2_align}}); break; // qs[4] + d_scales(vec2) + case GGML_TYPE_Q4_K: block_a_size = std430_size({{16, 4}, {fp2_size, fp2_align}}); break; // qs[4] + dm(vec2) + case GGML_TYPE_Q5_K: block_a_size = std430_size({{32, 4}, {fp2_size, fp2_align}}); break; // qs[8] + dm(vec2) + case GGML_TYPE_Q6_K: block_a_size = std430_size({{32, 4}, {fp2_size, fp2_align}}); break; // qs[8] + d_scales(vec2) + default: + return false; + } + + // block_b_cache: { int32_t qs[8]; FLOAT_TYPEV2 ds; } + const uint32_t block_b_size = std430_size({{32, 4}, {fp2_size, fp2_align}}); + + const uint32_t BM = warptile[1]; + const uint32_t BN = warptile[2]; + // mul_mmq.comp: BK_STEP=1 for MUL_MAT_ID, 4 otherwise. + const uint32_t BK_STEP = mul_mat_id ? 1u : 4u; + + const uint32_t buf_a_size = BM * BK_STEP * block_a_size; + const uint32_t buf_b_size = BN * BK_STEP * block_b_size; + const uint32_t mmid_row_ids = mul_mat_id ? (BN * 2u * (uint32_t)sizeof(uint16_t)) : 0u; + + const uint32_t warps = warptile[0] / warptile[10]; + const uint32_t ballots_sh = mul_mat_id ? (warps * 4u * (uint32_t)sizeof(uint32_t)) : 0u; + + const uint32_t total_size = buf_a_size + buf_b_size + mmid_row_ids + ballots_sh; + const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; + + VK_LOG_DEBUG("ggml_vk_matmul_int_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), " + "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", total=" << total_size << ", supported=" << supported); + + return supported; +} + +static const std::unordered_map rdna1_pipelines = { + {"soft_max", 64}, {"im2col", 64}, + {"argmax", 64}, {"mul_mat_vec", 64}, + {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32} +}; + +static const std::unordered_map rdna2_pipelines = { + {"soft_max", 64}, {"im2col", 64}, +}; + +static std::vector gpu_pipeline_configs = { + { + vk_device_architecture::AMD_RDNA1, + { + rdna1_pipelines, + }, + RDNA_DEFAULT_SUBGROUP_SIZE + }, + { + vk_device_architecture::AMD_RDNA2, + { + rdna2_pipelines, + }, + RDNA_DEFAULT_SUBGROUP_SIZE + }, +}; + +uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { + for (const auto &config : gpu_pipeline_configs) { + if (config.arch == arch) { + auto pipIt = config.pipelines.find(pipeline_name); + if (pipIt != config.pipelines.end()) { + return pipIt->second; + } + std::vector> sorted_pipelines(config.pipelines.begin(), config.pipelines.end()); + std::sort(sorted_pipelines.begin(), sorted_pipelines.end(), + [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); }); + for (const auto &entry : sorted_pipelines) { + if (pipeline_name.find(entry.first) != std::string::npos) { + return entry.second; + } + } + return config.default_subgroup_size; + } + } + return 0; // If no matching configuration is found +} + +static bool ggml_vk_fa_scalar_uses_mmq(const vk_device& device, ggml_type k_type) { +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + return device->integer_dot_product && device->subgroup_clustered && + (k_type == GGML_TYPE_Q4_0 || k_type == GGML_TYPE_Q4_1 || + k_type == GGML_TYPE_Q5_0 || k_type == GGML_TYPE_Q5_1 || + k_type == GGML_TYPE_Q8_0); +#else + GGML_UNUSED(device); + GGML_UNUSED(k_type); + return false; +#endif +} + +void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { + VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); + + // some shaders have a minimum subgroup size + const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u); + const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); + const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u); + + const uint32_t mul_mat_subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; + const uint32_t mul_mat_subgroup_size_8 = std::max(mul_mat_subgroup_size, 8u); + const uint32_t mul_mat_subgroup_size_16 = std::max(mul_mat_subgroup_size, 16u); + const uint32_t mul_mat_subgroup_size_32 = std::max(mul_mat_subgroup_size, 32u); + + const bool subgroup_min_size_16 = (!device->subgroup_size_control && device->subgroup_size >= 16) || + (device->subgroup_size_control && device->subgroup_max_size >= 16); + + // mulmat + std::vector l_warptile, m_warptile, s_warptile, + l_warptile_id, m_warptile_id, s_warptile_id, + l_warptile_mmq, m_warptile_mmq, s_warptile_mmq, + l_warptile_mmq_int, m_warptile_mmq_int, s_warptile_mmq_int, + l_warptile_mmq_int_k, m_warptile_mmq_int_k, s_warptile_mmq_int_k, + l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k, + l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid, + l_warptile_mmqid_int, m_warptile_mmqid_int, s_warptile_mmqid_int, + l_warptile_mmqid_int_k, m_warptile_mmqid_int_k, s_warptile_mmqid_int_k; + std::array l_wg_denoms, m_wg_denoms, s_wg_denoms, + l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms, + l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k, + l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; + + uint32_t l_align, m_align, s_align; + + vk_pipeline wait_pipeline; + CompileTask claimed_task {}; + bool has_claimed_task = false; + + // The rest of the walk reads and writes shared device state, so hold the + // lock until we're done deciding what to compile. + std::unique_lock compile_lock(device->compile_mutex); + + if (device->coopmat2) { + // spec constants and tile sizes for non-quant matmul/matmul_id + l_warptile = { 256, 128, 256, 64, 1 }; + m_warptile = { 256, 128, 128, 64, 0 }; + s_warptile = { 128, 64, 64, 64, 0 }; + l_wg_denoms = {128, 256, 1 }; + m_wg_denoms = {128, 128, 1 }; + s_wg_denoms = { 64, 64, 1 }; + + // spec constants and tile sizes for quant matmul (non-Qi_K) + l_warptile_mmq = { 256, 128, 256, 64, 1 }; + m_warptile_mmq = { 256, 128, 128, 64, 1 }; + s_warptile_mmq = { 256, 32, 64, 128, 0 }; + l_mmq_wg_denoms = { 128, 256, 1 }; + m_mmq_wg_denoms = { 128, 128, 1 }; + s_mmq_wg_denoms = { 32, 64, 1 }; + + // spec constants and tile sizes for quant matmul (Qi_K) + l_warptile_mmq_k = { 256, 128, 256, 64, 1 }; + m_warptile_mmq_k = { 256, 128, 128, 64, 1 }; + s_warptile_mmq_k = { 256, 32, 64, 128, 0 }; + l_mmq_wg_denoms_k = { 128, 256, 1 }; + m_mmq_wg_denoms_k = { 128, 128, 1 }; + s_mmq_wg_denoms_k = { 32, 64, 1 }; + + // spec constants and tile sizes for quant matmul_id + const uint32_t mmqid_bk = device->coopmat2_decode_vector ? 64u : 32u; + l_warptile_mmqid = { 256, 128, 128, mmqid_bk, 1, device->subgroup_size }; + m_warptile_mmqid = { 256, 128, 64, mmqid_bk, 0, device->subgroup_size }; + s_warptile_mmqid = { 256, 128, 64, mmqid_bk, 0, device->subgroup_size }; + l_mmqid_wg_denoms = { 128, 128, 1 }; + m_mmqid_wg_denoms = { 128, 64, 1 }; + s_mmqid_wg_denoms = { 128, 64, 1 }; + + l_align = 128; + m_align = 64; + s_align = 32; + } else { + // Matrix cores require different warp group sizes + const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4; + const uint32_t tm_m = device->coopmat_support ? device->coopmat_m : 4; + const uint32_t tm_s = device->coopmat_support ? device->coopmat_m : 2; + const uint32_t tn_l = device->coopmat_support ? device->coopmat_n : 4; + const uint32_t tn_m = device->coopmat_support ? device->coopmat_n : 2; + const uint32_t tn_s = device->coopmat_support ? device->coopmat_n : 2; + const uint32_t tk_l = device->coopmat_support ? device->coopmat_k : 1; + const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; + const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1; + + const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; + + l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; + m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; + + l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; + m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + s_warptile_mmq = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; + + // Integer MMQ has a smaller shared memory profile, but heavier register use + l_warptile_mmq_int = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; + m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 }; + s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, subgroup_size_8 }; + + // K-quants use even more registers, mitigate by setting WMITER to 1 + l_warptile_mmq_int_k = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 1, 4, 4, 1, subgroup_size_8 }; + m_warptile_mmq_int_k = { 128, 64, 64, 32, subgroup_size_8, 32, 1, 2, 2, 1, subgroup_size_8 }; + s_warptile_mmq_int_k = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, subgroup_size_8 }; + + l_warptile_id = { 128, 128, 128, 16, mul_mat_subgroup_size_16 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_16 }; + m_warptile_id = { 128, 64, 64, 16, mul_mat_subgroup_size_16, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_16 }; + s_warptile_id = { mul_mat_subgroup_size_16, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_16 }; + + l_warptile_mmqid = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_8 }; + m_warptile_mmqid = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_8 }; + s_warptile_mmqid = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_8 }; + + l_warptile_mmqid_int = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, 4, 4, 1, mul_mat_subgroup_size_8 }; + m_warptile_mmqid_int = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, 2, 2, 1, mul_mat_subgroup_size_8 }; + s_warptile_mmqid_int = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, mul_mat_subgroup_size_8 }; + + l_warptile_mmqid_int_k = { 128, 128, 128, 32, mul_mat_subgroup_size_16 * 2, 64, 1, 4, 4, 1, mul_mat_subgroup_size_16 }; + m_warptile_mmqid_int_k = { 128, 64, 64, 32, mul_mat_subgroup_size_16, 32, 1, 2, 2, 1, mul_mat_subgroup_size_16 }; + s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 }; + + // chip specific tuning + if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) { + m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; + m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; + } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary) { + // This is intentionally using tx_m values, slight performance increase + l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 }; + } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) { + // Xe2/Xe3 with coopmat enabled - warptile performance tuning + l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + } + + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; + m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; + s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; + l_align = 128; + m_align = 64; + s_align = 32; + + for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { + ggml_type t = (ggml_type)i; + // Disable medium and large matrix multiplication if not enough shared memory is available + // Check mmq warptiles as the largest configuration + // Throw an error if not enough for any matrix multiplication is available + if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false, t)) { + std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl; + throw std::runtime_error("Shared memory size too small for matrix multiplication."); + } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false, t)) { + device->mul_mat_m[i] = false; + device->mul_mat_l[i] = false; + } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false, t)) { + device->mul_mat_l[i] = false; + } + + // Disable mul_mat_id if not enough shared memory is available + if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmqid, true, t)) { + device->mul_mat_id_s[i] = false; + device->mul_mat_id_m[i] = false; + device->mul_mat_id_l[i] = false; + } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmqid, true, t)) { + device->mul_mat_id_m[i] = false; + device->mul_mat_id_l[i] = false; + } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmqid, true, t)) { + device->mul_mat_id_l[i] = false; + } + + // The q8_1 mmq path has its own (larger) shmem layout, check it separately. + // K-quants use the _int_k warptiles, others use _int. + const bool is_k_quant = (t == GGML_TYPE_Q2_K || t == GGML_TYPE_Q3_K || + t == GGML_TYPE_Q4_K || t == GGML_TYPE_Q5_K || + t == GGML_TYPE_Q6_K); + const auto & s_int = is_k_quant ? s_warptile_mmq_int_k : s_warptile_mmq_int; + const auto & m_int = is_k_quant ? m_warptile_mmq_int_k : m_warptile_mmq_int; + const auto & l_int = is_k_quant ? l_warptile_mmq_int_k : l_warptile_mmq_int; + const auto & s_intid = is_k_quant ? s_warptile_mmqid_int_k : s_warptile_mmqid_int; + const auto & m_intid = is_k_quant ? m_warptile_mmqid_int_k : m_warptile_mmqid_int; + const auto & l_intid = is_k_quant ? l_warptile_mmqid_int_k : l_warptile_mmqid_int; + + if (!ggml_vk_matmul_int_shmem_support(device, s_int, false, t)) { + device->mul_mat_s_int[i] = false; + device->mul_mat_m_int[i] = false; + device->mul_mat_l_int[i] = false; + } else if (!ggml_vk_matmul_int_shmem_support(device, m_int, false, t)) { + device->mul_mat_m_int[i] = false; + device->mul_mat_l_int[i] = false; + } else if (!ggml_vk_matmul_int_shmem_support(device, l_int, false, t)) { + device->mul_mat_l_int[i] = false; + } + + if (!ggml_vk_matmul_int_shmem_support(device, s_intid, true, t)) { + device->mul_mat_id_s_int[i] = false; + device->mul_mat_id_m_int[i] = false; + device->mul_mat_id_l_int[i] = false; + } else if (!ggml_vk_matmul_int_shmem_support(device, m_intid, true, t)) { + device->mul_mat_id_m_int[i] = false; + device->mul_mat_id_l_int[i] = false; + } else if (!ggml_vk_matmul_int_shmem_support(device, l_intid, true, t)) { + device->mul_mat_id_l_int[i] = false; + } + } + } + + if (!device->pipeline_matmul_f32) { + device->pipeline_matmul_f32 = std::make_shared(); + } + if (!device->pipeline_matmul_f32_f16) { + device->pipeline_matmul_f32_f16 = std::make_shared(); + } + if (!device->pipeline_matmul_id_f32) { + device->pipeline_matmul_id_f32 = std::make_shared(); + } + if (!device->pipeline_matmul_bf16) { + device->pipeline_matmul_bf16 = std::make_shared(); + } + if (!device->pipeline_matmul_id_bf16) { + device->pipeline_matmul_id_bf16 = std::make_shared(); + } + + auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint, + uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, + uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { + + if (!require_full_subgroups && required_subgroup_size == 0) { + required_subgroup_size = get_subgroup_size(name, device->architecture); + } + + vk_pipeline *ptr = &base_pipeline; + + int num_pipelines = 1; +#if defined(VK_EXT_shader_64bit_indexing) + if (device->shader_64b_indexing) { + num_pipelines = 2; + } +#endif + for (int i = 0; i < num_pipelines; ++i, ptr = &(*ptr)->next) { + vk_pipeline &pipeline = *ptr; + if (!pipeline) { + pipeline = std::make_shared(); + } + if (!pipeline->initialized) { + pipeline->name = name; + pipeline->parameter_count = parameter_count; + pipeline->push_constant_size = push_constant_size; + pipeline->wg_denoms = wg_denoms; + pipeline->align = align; + pipeline->initialized = true; +#if defined(VK_EXT_shader_64bit_indexing) + pipeline->is_64b_indexing = (i == 1); +#endif + } + + // We only care about the pipeline this call asked for; the rest + // (including the 64-bit indexing variant) are handled by their + // own request_descriptor_sets / load_shaders calls. + if (pipeline.get() != requested.get()) { + continue; + } + + if (pipeline->compiled) { + continue; + } + + wait_pipeline = pipeline; + + if (!pipeline->compile_pending) { + pipeline->compile_pending = true; + claimed_task.pipeline = pipeline; + claimed_task.spv_size = spv_size; + claimed_task.spv_data = spv_data; + claimed_task.entrypoint = entrypoint; + claimed_task.parameter_count = parameter_count; + claimed_task.wg_denoms = wg_denoms; + claimed_task.specialization_constants = specialization_constants; + claimed_task.disable_robustness = disable_robustness; + claimed_task.require_full_subgroups = require_full_subgroups; + claimed_task.required_subgroup_size = required_subgroup_size; + has_claimed_task = true; + } + } + }; + + auto const &ggml_vk_create_pipeline2 = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const char *entrypoint, + uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, + uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { + return ggml_vk_create_pipeline(device, pipeline, name.c_str(), spv_size, spv_data, entrypoint, + parameter_count, push_constant_size, wg_denoms, specialization_constants, + align, disable_robustness, require_full_subgroups, required_subgroup_size); + }; + + // FA scalar has two SPIR-V modules (MMQ vs non-MMQ); FA cm1 has one. K/V + // quant type is selected at runtime via the FaTypeK / FaTypeV spec constants. + + for (auto &fa : device->pipeline_flash_attn_f32_f16) { + if (fa.first.path != FA_SCALAR) continue; + const uint32_t Br = fa.first.Br; + const uint32_t Bc = fa.first.Bc; + const bool aligned = fa.first.aligned; + const bool f32acc = fa.first.f32acc; + const uint32_t fa_sgs = fa.first.subgroup_size; + const bool fa_ds = fa.first.subgroup_size == 0; + + const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; + const bool use_mmq = ggml_vk_fa_scalar_uses_mmq(device, fa.first.k_type); + const void * spv_data = nullptr; + size_t spv_size = 0; + const char *name = nullptr; + if (bf16_kv) { + spv_data = flash_attn_f32_f16_fp32_data; + spv_size = flash_attn_f32_f16_fp32_len; + name = aligned ? "flash_attn_f32_bf16_aligned" : "flash_attn_f32_bf16"; + } else if (use_mmq) { +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->fp16) { + if (f32acc) { spv_data = flash_attn_f32_f16_int8_data; spv_size = flash_attn_f32_f16_int8_len; } + else { spv_data = flash_attn_f32_f16_f16acc_int8_data; spv_size = flash_attn_f32_f16_f16acc_int8_len; } + } else { + spv_data = flash_attn_f32_f16_fp32_int8_data; + spv_size = flash_attn_f32_f16_fp32_int8_len; + } +#endif + name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; + } else { + if (device->fp16) { + if (device->dot2_f16) { + if (f32acc) { spv_data = flash_attn_f32_f16_dot2_data; spv_size = flash_attn_f32_f16_dot2_len; } + else { spv_data = flash_attn_f32_f16_dot2_f16acc_data; spv_size = flash_attn_f32_f16_dot2_f16acc_len; } + } else { + if (f32acc) { spv_data = flash_attn_f32_f16_data; spv_size = flash_attn_f32_f16_len; } + else { spv_data = flash_attn_f32_f16_f16acc_data; spv_size = flash_attn_f32_f16_f16acc_len; } + } + } else { + spv_data = flash_attn_f32_f16_fp32_data; + spv_size = flash_attn_f32_f16_fp32_len; + } + name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; + } + ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, + sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, + get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, + !fa_ds, !fa_ds ? fa_sgs : 0); + } + +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (device->coopmat1_fa_support) { + for (auto &fa : device->pipeline_flash_attn_f32_f16) { + if (fa.first.path != FA_COOPMAT1) continue; + const uint32_t Br = fa.first.Br; + const uint32_t Bc = fa.first.Bc; + const bool aligned = fa.first.aligned; + const bool f32acc = fa.first.f32acc; + const uint32_t fa_sgs = fa.first.subgroup_size; + const bool fa_ds = fa.first.subgroup_size == 0; + + const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; + + const void * spv_data; + size_t spv_size; + const char *name; + if (bf16_kv) { +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (!device->coopmat_bf16_support) continue; + spv_data = flash_attn_f32_f16_bf16_cm1_data; + spv_size = flash_attn_f32_f16_bf16_cm1_len; + name = aligned ? "flash_attn_f32_bf16_aligned_cm1" : "flash_attn_f32_bf16_cm1"; +#else + continue; +#endif + } else { + if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data; spv_size = flash_attn_f32_f16_cm1_len; } + else { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; } + name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1"; + } + ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, + sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, + get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, + !fa_ds, !fa_ds ? fa_sgs : 0); + } + } +#endif + +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (device->coopmat2) { + for (auto &fa : device->pipeline_flash_attn_f32_f16) { + if (fa.first.path != FA_COOPMAT2) continue; + const uint32_t Br = fa.first.Br; + const uint32_t Bc = fa.first.Bc; + const bool aligned = fa.first.aligned; + const bool f32acc = fa.first.f32acc; + + const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; + const void * spv_data; + size_t spv_size; + const char * name; + if (bf16_kv) { +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (!device->coopmat2_bf16_support) continue; + spv_data = flash_attn_f32_f16_bf16_cm2_data; + spv_size = flash_attn_f32_f16_bf16_cm2_len; + name = aligned ? "flash_attn_f32_bf16_aligned_cm2" : "flash_attn_f32_bf16_cm2"; +#else + continue; +#endif + } else if (aligned) { + if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data; spv_size = flash_attn_f32_f16_cm2_len; name = "flash_attn_f32_f16_aligned_f32acc_cm2"; } + else { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_aligned_f16acc_cm2"; } + } else { + if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data; spv_size = flash_attn_f32_f16_cm2_len; name = "flash_attn_f32_f16_f32acc_cm2"; } + else { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_f16acc_cm2"; } + } + ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, + sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, + get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, false, 0); + } + } +#endif + + const int mul_mat_id_param_count = 5; + +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (device->coopmat2) { + + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, true); \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + } +#endif + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q1_0], matmul_q1_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0], matmul_q4_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1], matmul_q4_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0], matmul_q5_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1], matmul_q5_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0], matmul_q8_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K], matmul_q2_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K], matmul_q3_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K], matmul_q4_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K], matmul_q5_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K], matmul_q6_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S], matmul_iq1_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M], matmul_iq1_m_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S], matmul_iq2_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S], matmul_iq3_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_MXFP4], matmul_mxfp4_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_NVFP4], matmul_nvfp4_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + + GGML_ASSERT(device->subgroup_ballot); + + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 5) +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 5) + } +#endif + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) +#undef CREATE_MM +#undef CREATE_MM2 + } else +#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (device->coopmat_support) { + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->coopmat_acc_f16_support) { \ + CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + } \ + if (device->coopmat_acc_f32_support) { \ + CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + } \ + + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ) + } +#endif + + if (device->coopmat_acc_f16_support) { + CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S], matmul_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M], matmul_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S], matmul_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S], matmul_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4], matmul_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + } else { + CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc, matmul_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc, matmul_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4].f32acc, matmul_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4].f32acc, matmul_nvfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + } + + GGML_ASSERT(device->subgroup_ballot); + + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + } +#endif + + CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); +#undef CREATE_MM2 +#undef CREATE_MM + } else +#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (device->fp16) { + // Create 6 variants, {s,m,l}x{unaligned,aligned} + // Selects dot2 SPIR-V variant at runtime when device->dot2_f16 is true +#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + + // bf16 scalar path promotes to f32, no dot2 variant +#define CREATE_MM_NODOT2(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + +#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + if (device->mul_mat ## ID ## _l_int[TYPE]) { \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->l, #NAMELC "_l", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + } \ + if (device->mul_mat ## ID ## _m_int[TYPE]) { \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->m, #NAMELC "_m", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + } \ + if (device->mul_mat ## ID ## _s_int[TYPE]) { \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->s, #NAMELC "_s", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + } \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S], matmul_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M], matmul_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S], matmul_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S], matmul_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4], matmul_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->integer_dot_product) { + CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0], matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1], matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0], matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1], matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0], matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_MXFP4], matmul_mxfp4_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q2_K], matmul_q2_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q3_K], matmul_q3_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_K], matmul_q4_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_K], matmul_q5_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); + CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q6_K], matmul_q6_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); + } +#endif + + if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) { + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->integer_dot_product) { + CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + + CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + + CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + } +#endif + } else { + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_iq1_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_iq1_m_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_iq2_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_iq2_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_iq2_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_iq3_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_iq3_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_iq4_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_mxfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_nvfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->integer_dot_product) { + CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_0], matmul_id_q4_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_1], matmul_id_q4_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_0], matmul_id_q5_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_1], matmul_id_q5_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q8_0], matmul_id_q8_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + + CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_MXFP4], matmul_id_mxfp4_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + + CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q2_K], matmul_id_q2_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q3_K], matmul_id_q3_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_K], matmul_id_q4_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_K], matmul_id_q5_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q6_K], matmul_id_q6_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + } +#endif + } +#undef CREATE_MM2 +#undef CREATE_MMQ +#undef CREATE_MM +#undef CREATE_MM_NODOT2 + } else { + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _l[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _m[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + if (device->mul_mat ## ID ## _s[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + +#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l_int[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC "_l", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m_int[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC "_m", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s_int[TYPE]) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC "_s", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + + CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc, matmul_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc, matmul_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4].f32acc, matmul_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4].f32acc, matmul_nvfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->integer_dot_product) { + CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); + + CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); + CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); + } +#endif + + if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) { + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); + + CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_subgroup_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_subgroup_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_subgroup_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_subgroup_q5_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_subgroup_q8_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_subgroup_q2_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_subgroup_q3_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_subgroup_q4_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_subgroup_q5_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_subgroup_q6_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f32acc, matmul_id_subgroup_iq1_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f32acc, matmul_id_subgroup_iq1_m_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_subgroup_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_subgroup_iq2_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_subgroup_iq2_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_subgroup_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_subgroup_iq3_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_subgroup_iq4_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4].f32acc, matmul_id_subgroup_nvfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + } else { + CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + + CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f32acc, matmul_id_iq1_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f32acc, matmul_id_iq1_m_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4].f32acc, matmul_id_nvfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + } + } + // reusing CREATE_MM from the fp32 path + if ((device->coopmat2 || device->coopmat_support) +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + && !device->coopmat_bf16_support +#endif + ) { + const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; + + // use scalar tile sizes + l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; + m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, 4, 2, 1, subgroup_size_8 }; + s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, 2, 2, 1, subgroup_size_8 }; + + l_wg_denoms = {128, 128, 1 }; + m_wg_denoms = { 64, 64, 1 }; + s_wg_denoms = { 32, 32, 1 }; + + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + } +#undef CREATE_MM + + // mul mat vec + + // the number of rows computed per shader depends on GPU model and quant + uint32_t rm_stdq = 1; + uint32_t rm_kq = 2; + uint32_t rm_stdq_int = 1; + uint32_t rm_kq_int = 1; + auto const &rm_iq_int = [](uint32_t i) { return i == 0 ? 8u : 4u; }; + if (device->vendor_id == VK_VENDOR_ID_AMD) { + if (device->architecture == AMD_GCN) { + rm_stdq = 2; + rm_kq = 4; + rm_stdq_int = 4; + } + } else if (device->vendor_id == VK_VENDOR_ID_INTEL) { + rm_stdq = 2; + rm_stdq_int = 2; + } + uint32_t rm_iq = 2 * rm_kq; + + const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN; + // Ensure a subgroup size >= 16 is available + const bool use_subgroups16 = use_subgroups && subgroup_min_size_16; + + const uint32_t subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16) ? 16 : device->subgroup_size; + const uint32_t subgroup_size16 = std::max(subgroup_size, 16u); + + const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0; + const uint32_t force_subgroup_size16 = use_subgroups16 ? subgroup_size16 : 0; + static constexpr uint32_t mul_mat_vec_num_bindings = 5; + static constexpr uint32_t mul_mat_vec_id_num_bindings = 6; + + for (uint32_t w = 0; w < DMMV_WG_SIZE_COUNT; ++w) { + const uint32_t wg_size_subgroup = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size : (subgroup_size * 4); + const uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size16 : (subgroup_size16 * 4); + + const shader_reduction_mode reduc = (use_subgroups && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP : + (use_subgroups && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID : + SHADER_REDUCTION_MODE_SHMEM; + + const shader_reduction_mode reduc16 = (use_subgroups16 && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP : + (use_subgroups16 && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID : + SHADER_REDUCTION_MODE_SHMEM; + + for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f32_f32", arr_dmmv_q1_0_f32_f32_len[reduc], arr_dmmv_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[reduc], arr_dmmv_q5_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[reduc], arr_dmmv_q8_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[reduc16], arr_dmmv_q2_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[reduc16], arr_dmmv_q3_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[reduc16], arr_dmmv_q4_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[reduc16], arr_dmmv_q5_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[reduc16], arr_dmmv_q6_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[reduc16], arr_dmmv_iq1_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[reduc16], arr_dmmv_iq1_m_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_iq2_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[reduc16], arr_dmmv_iq2_xs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[reduc16], arr_dmmv_iq2_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_iq3_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[reduc16], arr_dmmv_iq3_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[reduc16], arr_dmmv_iq4_xs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f32_f32", arr_dmmv_nvfp4_f32_f32_len[reduc16], arr_dmmv_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f16_f32", arr_dmmv_q1_0_f16_f32_len[reduc], arr_dmmv_q1_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[reduc], arr_dmmv_q5_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[reduc], arr_dmmv_q8_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[reduc16], arr_dmmv_q2_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[reduc16], arr_dmmv_q3_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[reduc16], arr_dmmv_q4_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[reduc16], arr_dmmv_q5_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[reduc16], arr_dmmv_q6_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[reduc16], arr_dmmv_iq1_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[reduc16], arr_dmmv_iq1_m_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[reduc16], arr_dmmv_iq2_xxs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[reduc16], arr_dmmv_iq2_xs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[reduc16], arr_dmmv_iq2_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[reduc16], arr_dmmv_iq3_xxs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[reduc16], arr_dmmv_iq3_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[reduc16], arr_dmmv_iq4_xs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f16_f32", arr_dmmv_nvfp4_f16_f32_len[reduc16], arr_dmmv_nvfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->integer_dot_product) { + const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; + const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_q8_1_f32", arr_dmmv_q5_1_q8_1_f32_len[reduc], arr_dmmv_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32", arr_dmmv_q8_0_q8_1_f32_len[reduc], arr_dmmv_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_q8_1_f32", arr_dmmv_mxfp4_q8_1_f32_len[reduc], arr_dmmv_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_q8_1_f32", arr_dmmv_q2_k_q8_1_f32_len[reduc], arr_dmmv_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_q8_1_f32", arr_dmmv_q3_k_q8_1_f32_len[reduc], arr_dmmv_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_q8_1_f32", arr_dmmv_q4_k_q8_1_f32_len[reduc], arr_dmmv_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_q8_1_f32", arr_dmmv_q5_k_q8_1_f32_len[reduc], arr_dmmv_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_q8_1_f32", arr_dmmv_q6_k_q8_1_f32_len[reduc], arr_dmmv_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_q8_1_f32", arr_dmmv_iq1_s_q8_1_f32_len[reduc], arr_dmmv_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_q8_1_f32", arr_dmmv_iq1_m_q8_1_f32_len[reduc], arr_dmmv_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int); + + } +#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT + } + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {wg_size_subgroup, 1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", arr_dmmv_id_f16_f32_f32_len[reduc], arr_dmmv_id_f16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", arr_dmmv_id_bf16_f32_f32_len[reduc], arr_dmmv_id_bf16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q1_0], "mul_mat_vec_id_q1_0_f32", arr_dmmv_id_q1_0_f32_f32_len[reduc], arr_dmmv_id_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", arr_dmmv_id_q4_0_f32_f32_len[reduc], arr_dmmv_id_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", arr_dmmv_id_q4_1_f32_f32_len[reduc], arr_dmmv_id_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", arr_dmmv_id_q5_0_f32_f32_len[reduc], arr_dmmv_id_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", arr_dmmv_id_q5_1_f32_f32_len[reduc], arr_dmmv_id_q5_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", arr_dmmv_id_q8_0_f32_f32_len[reduc], arr_dmmv_id_q8_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", arr_dmmv_id_q2_k_f32_f32_len[reduc16], arr_dmmv_id_q2_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", arr_dmmv_id_q3_k_f32_f32_len[reduc16], arr_dmmv_id_q3_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", arr_dmmv_id_q4_k_f32_f32_len[reduc16], arr_dmmv_id_q4_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", arr_dmmv_id_q5_k_f32_f32_len[reduc16], arr_dmmv_id_q5_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", arr_dmmv_id_q6_k_f32_f32_len[reduc16], arr_dmmv_id_q6_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", arr_dmmv_id_iq1_s_f32_f32_len[reduc16], arr_dmmv_id_iq1_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", arr_dmmv_id_iq1_m_f32_f32_len[reduc16], arr_dmmv_id_iq1_m_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", arr_dmmv_id_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_id_iq2_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", arr_dmmv_id_iq2_xs_f32_f32_len[reduc16], arr_dmmv_id_iq2_xs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", arr_dmmv_id_iq2_s_f32_f32_len[reduc16], arr_dmmv_id_iq2_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", arr_dmmv_id_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_id_iq3_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", arr_dmmv_id_iq3_s_f32_f32_len[reduc16], arr_dmmv_id_iq3_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", arr_dmmv_id_iq4_xs_f32_f32_len[reduc16], arr_dmmv_id_iq4_xs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", arr_dmmv_id_iq4_nl_f32_f32_len[reduc16], arr_dmmv_id_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_f32", arr_dmmv_id_mxfp4_f32_f32_len[reduc16], arr_dmmv_id_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_NVFP4], "mul_mat_vec_id_nvfp4_f32", arr_dmmv_id_nvfp4_f32_f32_len[reduc16], arr_dmmv_id_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + if (device->integer_dot_product) { + const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; + const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int); + } +#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT + } + +#if !defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + GGML_UNUSED(rm_stdq_int); + GGML_UNUSED(rm_kq_int); + GGML_UNUSED(rm_iq_int); +#endif + + // dequant shaders + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q1_0], "dequant_q1_0", dequant_q1_0_len, dequant_q1_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 8, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ1_S], "dequant_iq1_s", dequant_iq1_s_len, dequant_iq1_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ1_M], "dequant_iq1_m", dequant_iq1_m_len, dequant_iq1_m_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XXS], "dequant_iq2_xxs", dequant_iq2_xxs_len, dequant_iq2_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XS], "dequant_iq2_xs", dequant_iq2_xs_len, dequant_iq2_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_XS], "dequant_iq4_xs", dequant_iq4_xs_len, dequant_iq4_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_MXFP4], "dequant_mxfp4", dequant_mxfp4_len, dequant_mxfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_NVFP4], "dequant_nvfp4", dequant_nvfp4_len, dequant_nvfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + + // get_rows + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_BF16], "get_rows_bf16", get_rows_bf16_len, get_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q1_0], "get_rows_q1_0", get_rows_q1_0_len, get_rows_q1_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q2_K], "get_rows_q2_k", get_rows_q2_k_len, get_rows_q2_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q3_K], "get_rows_q3_k", get_rows_q3_k_len, get_rows_q3_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_K], "get_rows_q4_k", get_rows_q4_k_len, get_rows_q4_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_K], "get_rows_q5_k", get_rows_q5_k_len, get_rows_q5_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q6_K], "get_rows_q6_k", get_rows_q6_k_len, get_rows_q6_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ1_S], "get_rows_iq1_s", get_rows_iq1_s_len, get_rows_iq1_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ1_M], "get_rows_iq1_m", get_rows_iq1_m_len, get_rows_iq1_m_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs", get_rows_iq2_xxs_len, get_rows_iq2_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs", get_rows_iq2_xs_len, get_rows_iq2_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_NVFP4], "get_rows_nvfp4", get_rows_nvfp4_len, get_rows_nvfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_BF16], "get_rows_bf16_f32", get_rows_bf16_f32_len, get_rows_bf16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q1_0], "get_rows_q1_0_f32", get_rows_q1_0_f32_len, get_rows_q1_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q2_K], "get_rows_q2_k_f32", get_rows_q2_k_f32_len, get_rows_q2_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q3_K], "get_rows_q3_k_f32", get_rows_q3_k_f32_len, get_rows_q3_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_K], "get_rows_q4_k_f32", get_rows_q4_k_f32_len, get_rows_q4_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_K], "get_rows_q5_k_f32", get_rows_q5_k_f32_len, get_rows_q5_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q6_K], "get_rows_q6_k_f32", get_rows_q6_k_f32_len, get_rows_q6_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ1_S], "get_rows_iq1_s_f32", get_rows_iq1_s_f32_len, get_rows_iq1_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ1_M], "get_rows_iq1_m_f32", get_rows_iq1_m_f32_len, get_rows_iq1_m_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs_f32", get_rows_iq2_xxs_f32_len, get_rows_iq2_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs_f32", get_rows_iq2_xs_f32_len, get_rows_iq2_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs_f32", get_rows_iq4_xs_f32_len, get_rows_iq4_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); + + for (auto &it : device->pipeline_fa_mask_opt) { + auto BrBc = it.first; + ggml_vk_create_pipeline(device, it.second, "fa_mask_opt", fa_mask_opt_len, fa_mask_opt_data, "main", 2, sizeof(vk_op_flash_attn_mask_opt_push_constants), {1, 1, 1}, {128, 128 / device->subgroup_size, BrBc.first, BrBc.second}, 1, true, true, device->subgroup_size); + } + + if (device->subgroup_clustered && device->subgroup_require_full_support) { + ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true); + } else { + ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1); + } + + for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { + if (device->subgroup_arithmetic && device->subgroup_require_full_support) { + ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true); + } else { + ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true); + } + } + ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_nc_push_constants), {1, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_partials_f32, "rms_norm_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_partials_f32, "rms_norm_mul_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); + + if (sizeof(vk_op_rms_norm_mul_rope_push_constants) <= device->properties.limits.maxPushConstantsSize) { + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f32, "rms_norm_mul_rope_f32_f32", rms_norm_mul_rope_f32_f32_len, rms_norm_mul_rope_f32_f32_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f16, "rms_norm_mul_rope_f32_f16", rms_norm_mul_rope_f32_f16_len, rms_norm_mul_rope_f32_f16_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); + } + + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_bf16_f32,"cpy_bf16_f32",cpy_bf16_f32_len,cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_i32_f32, "cpy_i32_f32", cpy_i32_f32_len, cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_i32, "cpy_f32_i32", cpy_f32_i32_len, cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_bf16_f32,"contig_cpy_bf16_f32",contig_cpy_bf16_f32_len,contig_cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + +#define SET_ROWS(itype) \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## _len, set_rows_f32 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + + SET_ROWS(_i32) + SET_ROWS(_i64) +#undef SET_ROWS + + + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q1_0], "cpy_q1_0_f32", cpy_q1_0_f32_len, cpy_q1_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q1_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); + + auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) { + std::string s; + s += std::string(src0_f16 ? "_f16" : "_f32"); + s += std::string(src1_f16 ? "_f16" : "_f32"); + s += std::string(dst_f16 ? "_f16" : "_f32"); + return s; + }; + +#define CREATE_BINARY(name, namemod, spec, bindings) \ + for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \ + ggml_vk_create_pipeline2(device, device->pipeline_ ## name ## namemod[s0][s1][d], \ + #name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \ + "main", (bindings), sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1); + + CREATE_BINARY(add, , {0}, 4) + CREATE_BINARY(add, _norepeat, {1}, 4) + CREATE_BINARY(sub, , {0}, 3) + CREATE_BINARY(sub, _norepeat, {1}, 3) + CREATE_BINARY(mul, , {0}, 3) + CREATE_BINARY(mul, _norepeat, {1}, 3) + CREATE_BINARY(div, , {0}, 3) + CREATE_BINARY(div, _norepeat, {1}, 3) + CREATE_BINARY(add_rms, , {0}, 4) + CREATE_BINARY(add_rms, _norepeat, {1}, 4) +#undef CREATE_BINARY + + if (device->multi_add) { + for (uint32_t i = 0; i < MAX_FUSED_ADDS; ++i) { + ggml_vk_create_pipeline2(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); + ggml_vk_create_pipeline2(device, device->pipeline_multi_add_rms[i], "multi_add_rms_f32_" + std::to_string(i+1), multi_add_rms_f32_len, multi_add_rms_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); + } + } + + ggml_vk_create_pipeline(device, device->pipeline_add_id_f32, "add_id_f32", add_id_f32_len, add_id_f32_data, "main", 4, sizeof(vk_op_add_id_push_constants), {1, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_concat_i8, "concat_i8", concat_i8_len, concat_i8_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i16, "concat_i16", concat_i16_len, concat_i16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i64, "concat_i64", concat_i64_len, concat_i64_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bicubic_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BICUBIC}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_antialias_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_sqrt_f32, "sqrt_f32", sqrt_f32_len, sqrt_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_repeat_i32, "repeat_i32", repeat_i32_len, repeat_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + +#define CREATE_UNARY(name) \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + + CREATE_UNARY(elu) + CREATE_UNARY(gelu) + CREATE_UNARY(gelu_erf) + CREATE_UNARY(gelu_quick) + CREATE_UNARY(silu) + CREATE_UNARY(relu) + CREATE_UNARY(xielu) + CREATE_UNARY(neg) + CREATE_UNARY(tanh) + CREATE_UNARY(sigmoid) + CREATE_UNARY(hardsigmoid) + CREATE_UNARY(hardswish) + CREATE_UNARY(abs) + CREATE_UNARY(softplus) + CREATE_UNARY(step) + CREATE_UNARY(round) + CREATE_UNARY(ceil) + CREATE_UNARY(floor) + CREATE_UNARY(trunc) + CREATE_UNARY(sgn) + CREATE_UNARY(exp) + CREATE_UNARY(expm1) +#undef CREATE_UNARY + + ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + +#define CREATE_GLU(name) \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); + + CREATE_GLU(geglu) + CREATE_GLU(reglu) + CREATE_GLU(swiglu) + CREATE_GLU(swiglu_oai) + CREATE_GLU(geglu_erf) + CREATE_GLU(geglu_quick) +#undef CREATE_GLU + + ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true); + + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); + + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f32, "rope_vision_f32", rope_vision_f32_len, rope_vision_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + + for (uint32_t i = 0; i < num_argsort_pipelines; ++i) { + uint32_t BLOCK_SIZE = 1u << std::min(i, device->max_workgroup_size_log2); + if (i <= device->max_workgroup_size_log2 && + 2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) { + const uint32_t NCOLS_PADDED_LOG2 = i; + ggml_vk_create_pipeline2(device, device->pipeline_argsort_f32[i], "argsort_f32_"+std::to_string(i), argsort_f32_len, argsort_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true); + } + const uint32_t WG_UNROLL_FACTOR = BLOCK_SIZE > 1 ? 2 : 1; + BLOCK_SIZE /= WG_UNROLL_FACTOR; + ggml_vk_create_pipeline2(device, device->pipeline_argsort_large_f32[i], "argsort_large_f32_"+std::to_string(i), argsort_large_f32_len, argsort_large_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE * WG_UNROLL_FACTOR, 1, 1}, {BLOCK_SIZE, WG_UNROLL_FACTOR}, 1, true); + } + + for (uint32_t i = 0; i < num_topk_pipelines; ++i) { + const uint32_t BLOCK_SIZE = 1u << i; + const uint32_t NCOLS_PADDED_LOG2 = i; + if (i <= device->max_workgroup_size_log2) { + uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE + + sizeof(int) * device->subgroup_size + + 2 * sizeof(int) + + 2 * (BLOCK_SIZE / device->subgroup_size) * sizeof(int); + if (device->subgroup_arithmetic && device->subgroup_require_full_support && device->subgroup_shuffle && device->subgroup_ballot && + nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) { + ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size); + } else if (2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) { + ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_argsort_f32_len, topk_argsort_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true); + } + } + } + + ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + + ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + // Intel Arc B390 was observed segfaulting with this shader. + if (device->subgroup_basic && device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL) { + int idx = 0; + for (uint32_t n : {64, 128, 256, 512}) { + if (device->subgroup_size <= n) { + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_f32", fwht_f32_len, fwht_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { device->subgroup_size, n }, 1, true, true, device->subgroup_size); + } + ++idx; + } + } else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) { + // Disabled on Intel Windows due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147 + int idx = 0; + for (uint32_t n : {64, 128, 256, 512}) { + const uint32_t block_size = std::min(device->subgroup_size, n); + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1); + ++idx; + } + } + + const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4; + ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size); + + ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1); + + ggml_vk_create_pipeline(device, device->pipeline_count_experts, "count_experts", count_experts_len, count_experts_data, "main", 2, sizeof(vk_op_count_experts_push_constants), {1, 1, 1}, {}, 1, true); + + for (auto &s : device->pipeline_solve_tri_f32) { + const vk_solve_tri_pipeline_state &state = s.first; + + // Max number of rows to load at a time, limited by shared memory + const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((state.N + state.K) * sizeof(float)); + // Need at least K invocations, and prefer a minimum of 128 to spread out loading shared memory + const uint32_t block_size = std::max(128u, 1u << (uint32_t)ceilf(log2f(float(state.K)))); + + ggml_vk_create_pipeline( + device, s.second, "solve_tri_f32", + solve_tri_f32_len, solve_tri_f32_data, "main", 3, + sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K, batch_N, block_size }, 1, true); + } + +#define IM2COL(bda) \ + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32 ## bda ## _len, im2col_3d_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); + if (device->shader_int64 && device->buffer_device_address) { + IM2COL(_bda) + } else { + IM2COL() + } + + ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32, "col2im_1d_f32", col2im_1d_f32_len, col2im_1d_f32_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16, "col2im_1d_f16", col2im_1d_f16_len, col2im_1d_f16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); + + ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); + + { + const uint32_t gdn_sizes[] = {16, 32, 64, 128}; + const char * gdn_names[][2] = { + {"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"}, + {"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"}, + {"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"}, + {"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"}, + }; + for (uint32_t si = 0; si < 4; si++) { + const uint32_t S_V = gdn_sizes[si]; + GGML_ASSERT(is_pow2(S_V)); + + uint32_t lanes_per_column; + if (S_V >= 128u && device->subgroup_clustered) { + lanes_per_column = 8u; + } else { + // Use largest power-of-two that divides both S_V and subgroup_size so that + // (1) S_V % lanes_per_column == 0 and (2) S_V % (subgroup_size / lanes_per_column) == 0. + // This means we don't need extra bounds checking logic in the shader. + lanes_per_column = std::min(S_V, device->subgroup_size); + } + + // gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and + // S_V % LANES_PER_COLUMN == 0 to avoid bounds checks. + while (lanes_per_column > 1u) { + const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 && + (S_V % lanes_per_column) == 0; + const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0; + if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) { + break; + } + lanes_per_column >>= 1u; + } + + GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0); + GGML_ASSERT((S_V % lanes_per_column) == 0); + GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0); + + const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size; + const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce; + const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce; + const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce; + size_t gdn_len; + const void * gdn_data; + if (use_clustered_reduce) { + gdn_len = gated_delta_net_f32_len; + gdn_data = (const void *)gated_delta_net_f32_data; + } else if (use_subgroup_reduce) { + gdn_len = gated_delta_net_f32_nocluster_len; + gdn_data = (const void *)gated_delta_net_f32_nocluster_data; + } else { + gdn_len = gated_delta_net_f32_shmem_len; + gdn_data = (const void *)gated_delta_net_f32_shmem_data; + } + + const uint32_t cols_per_wg = device->subgroup_size / lanes_per_column; + const std::array wg_denoms = {1u, 1u, cols_per_wg}; + + for (uint32_t kda = 0; kda < 2; kda++) { + ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda], + gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants), + wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size); + } + } + } + + if (device->subgroup_arithmetic && device->subgroup_require_full_support) { + ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true); + ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true); + } else { + ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true); + ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true); + } + + ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 0}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_silu_f32, "ssm_conv_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_bias_silu_f32, "ssm_conv_bias_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 1, 1}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + + // conv2d, conv_transpose_2d + for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) { + // smaller WG for the small-tile fallback gives more concurrent WGs per SM + uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256; + uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices. + uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8; + uint32_t conv2d_SHMEM_PAD = 4; + vk_conv_block_size conv2d_BS = vk_conv_block_sizes[s]; + bool conv2d_UNROLL = true; + +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (device->coopmat2) { + conv2d_SHMEM_PAD = 8; // 8 float16_t + } +#endif + + if (device->vendor_id == VK_VENDOR_ID_INTEL) { + conv2d_SHMEM_PAD = 0; + conv2d_UNROLL = false; + } else if (device->vendor_id == VK_VENDOR_ID_AMD) { + conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4; + if (s == CONV_SHAPE_128x128 && device->architecture != vk_device_architecture::AMD_GCN) { + conv2d_UNROLL = false; + } + } + + // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math. + bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA || + device->architecture == vk_device_architecture::NVIDIA_PRE_TURING; + bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD || + device->architecture == vk_device_architecture::AMD_GCN; + + if (device->subgroup_shuffle && + device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316. + allow_collectives_nv && + allow_collectives_amd) { + use_collectives = 1; + conv2d_BS.CRS = std::min( + device->subgroup_size, + conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used. + } + + // cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size). + // Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader. + // Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need + // subgroup_size_control to force the driver to actually use it. + bool conv2d_use_cm1 = false; +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + conv2d_use_cm1 = !device->coopmat2 && + device->coopmat_support && device->coopmat_support_16x16x16_f16acc && + device->subgroup_size_control && + (device->subgroup_size == 32 || device->subgroup_size == 64) && + s != CONV_SHAPE_128x128; +#endif + + const uint32_t conv2d_cm1_shmem_pad = 8; + + auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) { + const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float); + const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u; + return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size; + }; + + // coopmat1 needs to store the output through shared memory, so check up front + // whether it'll fit and disable it before applying coopmat1 parameters. + if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) { + conv2d_use_cm1 = false; + } + + uint32_t conv2d_WM = 16, conv2d_WN = 16; // cm1 subgroup tile, ignored otherwise + if (conv2d_use_cm1) { + conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad; + // 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256 + // (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64). + const bool sg64 = (device->subgroup_size == 64); + switch (s) { + case CONV_SHAPE_64x32: conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break; + case CONV_SHAPE_64x128: conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break; + case CONV_SHAPE_32x256: conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break; + default: break; + } + const uint32_t warps_M = conv2d_BS.K / conv2d_WM; + const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN; + conv2d_WG_SIZE = warps_M * warps_N * device->subgroup_size; + } + + // stage cm2 accumulator through shmem for coalesced global stores; + // skipped on 128x128 where the extra Csh footprint hurts occupancy. + // cm1 always uses the staged path. + uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u; + if (conv2d_use_cm1) { + conv2d_csh_store = 1; + } + + // shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar + const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1; + + // shrink CRS if the non-cm1 config still doesn't fit + if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) { + GGML_ASSERT(!conv2d_use_cm1); + conv2d_BS.CRS = 8; + if (use_collectives) { + conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS); + } + conv2d_csh_store = 0; + } + + std::array wg_denoms = { conv2d_BS.K, 1, 1 }; + std::vector spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD }; + + // cm1 needs a fixed subgroup width to match the WG_SIZE we computed + const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0; + +#define CREATE_CONV(name, type_suffix, spv_suffix) \ + for (auto &c : device->pipeline_##name##type_suffix[s]) { \ + const vk_conv2d_pipeline_state &state = c.first; \ + std::vector spec_constants_cpy = spec_constants; \ + spec_constants_cpy.push_back(state.s0); \ + spec_constants_cpy.push_back(state.s1); \ + spec_constants_cpy.push_back(state.p0); \ + spec_constants_cpy.push_back(state.p1); \ + spec_constants_cpy.push_back(state.d0); \ + spec_constants_cpy.push_back(state.d1); \ + spec_constants_cpy.push_back(state.KW); \ + spec_constants_cpy.push_back(state.KH); \ + spec_constants_cpy.push_back(state.aligned); \ + spec_constants_cpy.push_back(conv2d_csh_store); \ + spec_constants_cpy.push_back(conv2d_WM); \ + spec_constants_cpy.push_back(conv2d_WN); \ + ggml_vk_create_pipeline( \ + device, c.second, #name #type_suffix, \ + name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \ + sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size); \ + } +#define CREATE_CONVS(spv_suffix) \ + CREATE_CONV(conv2d, _f32, spv_suffix) \ + CREATE_CONV(conv2d, _f16_f32, spv_suffix) \ + CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \ + CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix) +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (device->coopmat2) { + CREATE_CONVS(_cm2) + } else +#endif +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (conv2d_use_cm1) { + CREATE_CONVS(_cm1) + } else +#endif + if (conv2d_UNROLL) { + CREATE_CONVS(_unroll) + } else { + CREATE_CONVS( ) + } +#undef CREATE_CONV +#undef CREATE_CONVS + } + + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); + + for (uint32_t use_push = 0; use_push < 2; ++use_push) { + for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + } + } + + // Drop compile_mutex so other threads can walk while we compile. + compile_lock.unlock(); + + // Compile what we claimed; create_pipeline_func reacquires compile_mutex + // at the end to flip compile_pending/compiled and notify waiters. + if (has_claimed_task) { + auto & task = claimed_task; + ggml_vk_create_pipeline_func(device, task.pipeline, task.spv_size, task.spv_data, + task.entrypoint, task.parameter_count, task.wg_denoms, + task.specialization_constants, task.disable_robustness, + task.require_full_subgroups, task.required_subgroup_size); + } + + // Another thread may be compiling the pipeline we need; block on it here. + if (wait_pipeline) { + std::unique_lock wait_lock(device->compile_mutex); + device->compile_cv.wait(wait_lock, [&] { + return wait_pipeline->compiled.load(); + }); + } +} + +bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type) { + GGML_UNUSED(f32acc); + GGML_UNUSED(v_type); + // Needs to be kept up to date on shader changes + const uint32_t wg_size = params.workgroup_size; + const uint32_t Br = params.block_rows; + const uint32_t Bc = params.block_cols; + + // BF16 uses the fp32 shader (FLOAT_TYPE=float) + const uint32_t float_type_size = (device->fp16 && k_type != GGML_TYPE_BF16) ? sizeof(ggml_fp16_t) : sizeof(float); + + const bool mmq = ggml_vk_fa_scalar_uses_mmq(device, k_type); + + // tmpsh is overestimated slightly + const uint32_t tmpsh = wg_size * sizeof(float); + const uint32_t tmpshv4 = wg_size * 4 * float_type_size; + + const uint32_t masksh = Bc * (Br + 1) * float_type_size; + + uint32_t Qf, kvsh, kblocksh_size; + if (mmq) { + // block_b_cache: int32_t qs[8] + FLOAT_TYPEV2 ds + const uint32_t block_b_size = 8 * sizeof(int32_t) + 2 * float_type_size; + Qf = Br * (hsk / 32) * block_b_size; + + // kvsh uses D = HSV (K goes through kblocksh instead) + kvsh = params.shmem_staging ? Bc * (hsv / 4 + 1) * 4 * float_type_size : 4 * float_type_size; + + // The mixed MMQ shader uses a superset block_a_cache that fits every + // FA-supported quant: int32_t qs[8] + uint32_t qh + FLOAT_TYPEV2 dm. + // Single-scale types leave dm.y unused; non-Q5_* leave qh unused. + const uint32_t block_a_size = 8 * sizeof(int32_t) + sizeof(uint32_t) + 2 * float_type_size; + kblocksh_size = params.shmem_staging ? Bc * (hsk / 32) * block_a_size : block_a_size; + } else { + Qf = Br * (hsk / 4 + 1) * 4 * float_type_size; + + const uint32_t D = std::max(hsk, hsv); + kvsh = params.shmem_staging ? Bc * (D / 4 + 1) * 4 * float_type_size : 4 * float_type_size; + + kblocksh_size = 0; + } + + const uint32_t total_size = tmpsh + tmpshv4 + masksh + Qf + kvsh + kblocksh_size; + const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; + + VK_LOG_DEBUG("ggml_vk_flash_attn_scalar_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", mmq=" << mmq << ", total_size=" << total_size << ", supported=" << supported); + + return supported; +} + +bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type) { + // Needs to be kept up to date on shader changes + const uint32_t Br = params.block_rows; + const uint32_t Bc = params.block_cols; + + const uint32_t MatBr = 16, MatBc = 16; + + const uint32_t row_split = Bc / MatBc; + + const uint32_t hsk_pad = ROUNDUP_POW2(hsk, 16); + const uint32_t hsv_pad = ROUNDUP_POW2(hsv, 16); + + const uint32_t acctype = f32acc ? 4 : 2; + const uint32_t f16vec4 = 8; + + const uint32_t tmpsh = (Bc / MatBc) * sizeof(float); + + const uint32_t qstride = hsk_pad / 4 + 2; + const uint32_t Qf = Br * qstride * f16vec4; + + const uint32_t psh_stride = Br / 4 + 2; + const uint32_t Psh = Bc * psh_stride * f16vec4; + + const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br; + const uint32_t sfsh = Bc * sfshstride * acctype; + + const uint32_t kvshstride = (params.shmem_staging ? std::max(hsk_pad, hsv_pad) : MatBr) / 4 + 2; + const uint32_t vsh_stride = MatBc / 4 * row_split; + const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4; + + // BF16 PVMat accumulator is f32 (no bf16 accumulator support), so pvsh is vec4 (16 bytes) + const uint32_t pvsh_elem_size = (k_type == GGML_TYPE_BF16) ? 16u : f16vec4; + const uint32_t osh_stride = params.row_split * MatBr / 4; + const uint32_t pvsh = MatBc * osh_stride * pvsh_elem_size; + + const uint32_t slope = Br * acctype; + + const uint32_t total_size = tmpsh + Qf + Psh + sfsh + ksh + pvsh + slope; + const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; + + VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported); + + return supported; +} + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-types.h b/ggml/src/ggml-vulkan/ggml-vulkan-types.h new file mode 100644 index 0000000000..75be7fb3a7 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-types.h @@ -0,0 +1,1341 @@ +#pragma once + +#include "ggml-vulkan.h" + +#include + +#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) +#include +#include "ggml-cpu.h" +#endif + +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 + +#if VK_HEADER_VERSION >= 301 +namespace vk::detail { class DispatchLoaderDynamic; } +using vk::detail::DispatchLoaderDynamic; +#else +namespace vk { class DispatchLoaderDynamic; } +using vk::DispatchLoaderDynamic; +#endif + +DispatchLoaderDynamic & ggml_vk_default_dispatcher(); + +#define VULKAN_HPP_DEFAULT_DISPATCHER ggml_vk_default_dispatcher() + +#include + +#ifndef VK_NV_cooperative_matrix_decode_vector +#define VK_NV_cooperative_matrix_decode_vector 1 +#define VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME "VK_NV_cooperative_matrix_decode_vector" +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV ((VkStructureType)1000689000) +typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV { + VkStructureType sType; + void* pNext; + VkBool32 cooperativeMatrixDecodeVector; +} VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV; +#endif + +#if __has_include() +# include +#elif __has_include() +# include +#elif __has_include() +# include +#else + // Fallback to let the compiler throw a standard "file not found" error +# include +#endif + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#if defined(_MSC_VER) +# define NOMINMAX 1 +# include +# define YIELD() YieldProcessor() +#elif defined(__clang__) || defined(__GNUC__) +# if defined(__x86_64__) ||defined(__i386__) +# include +# define YIELD() _mm_pause() +# elif defined(__arm__) || defined(__aarch64__) +# if defined(__clang__) +# include +# define YIELD() __yield() +# else +# define YIELD() asm volatile("yield") +# endif +# endif +#endif + +#if !defined(YIELD) +#define YIELD() +#endif + +#include "ggml-impl.h" + +#include "ggml-backend-impl.h" + +#include "ggml-vulkan-shaders.hpp" + +#if !defined(VK_KHR_shader_bfloat16) + +#define VK_KHR_shader_bfloat16 1 +#define VK_KHR_SHADER_BFLOAT16_SPEC_VERSION 1 +#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16" +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000) +#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000) + +typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderBFloat16Type; + VkBool32 shaderBFloat16DotProduct; + VkBool32 shaderBFloat16CooperativeMatrix; +} VkPhysicalDeviceShaderBfloat16FeaturesKHR; +#endif + +#if !defined(VK_VALVE_shader_mixed_float_dot_product) +#define VK_VALVE_shader_mixed_float_dot_product 1 +#define VK_VALVE_SHADER_MIXED_FLOAT_DOT_PRODUCT_SPEC_VERSION 1 +#define VK_VALVE_SHADER_MIXED_FLOAT_DOT_PRODUCT_EXTENSION_NAME "VK_VALVE_shader_mixed_float_dot_product" +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE ((VkStructureType)1000673000) +typedef struct VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE { + VkStructureType sType; + void* pNext; + VkBool32 shaderMixedFloatDotProductFloat16AccFloat32; + VkBool32 shaderMixedFloatDotProductFloat16AccFloat16; + VkBool32 shaderMixedFloatDotProductBFloat16Acc; + VkBool32 shaderMixedFloatDotProductFloat8AccFloat32; +} VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE; +#endif + +#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) + +#define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) + +static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } + +#define VK_VENDOR_ID_AMD 0x1002 + +#define VK_VENDOR_ID_APPLE 0x106b + +#define VK_VENDOR_ID_INTEL 0x8086 + +#define VK_VENDOR_ID_NVIDIA 0x10de + +#define VK_VENDOR_ID_QUALCOMM 0x5143 + +#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256 + +#define VK_CHECK(err, msg) \ + do { \ + vk::Result err_ = (err); \ + if (err_ != vk::Result::eSuccess) { \ + fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n", \ + #err, to_string(err_).c_str(), __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +#ifdef GGML_VULKAN_DEBUG +#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl +#else +#define VK_LOG_DEBUG(msg) ((void) 0) +#endif // GGML_VULKAN_DEBUG + +#define MAX_PARAMETER_COUNT 12 + +#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 3) + +struct vk_pipeline_struct; + +typedef std::shared_ptr vk_pipeline; + +struct vk_pipeline_struct { + std::string name; + vk::ShaderModule shader_module; + vk::PipelineLayout layout; + vk::Pipeline pipeline; + uint32_t push_constant_size; + uint32_t parameter_count; + std::array wg_denoms; + uint32_t align; + // true if fields have been set by ggml_vk_create_pipeline + bool initialized {}; + // true while a compile is in flight, used to dedupe concurrent claims. + // Protected by device->compile_mutex. + bool compile_pending {}; + // set to true when the shader has been compiled + std::atomic compiled {}; + // number of registers used, extracted from pipeline executable properties + uint32_t register_count {}; + +#if defined(VK_EXT_shader_64bit_indexing) + bool is_64b_indexing {}; +#endif + // linked list of pipelines for multiple compilation variants. + // currently only used to compile a 64-bit indexing variant. + vk_pipeline next; +}; + +typedef std::weak_ptr vk_pipeline_ref; + +struct vk_matmul_pipeline_struct { + vk_pipeline l, m, s; + vk_pipeline a_l, a_m, a_s; + // Returns true when all unaligned pipelines are null. + // We only check for unaligned variants since one of the unaligned pipelines must exist + // while aligned pipelines are optional + bool is_empty() const { + return l == nullptr && m == nullptr && s == nullptr; + } +}; + +typedef std::shared_ptr vk_matmul_pipeline; + +struct vk_matmul_pipeline2 { + vk_matmul_pipeline2() { + f16acc = std::make_shared(); + f32acc = std::make_shared(); + } + vk_matmul_pipeline f32acc; + vk_matmul_pipeline f16acc; +}; + +struct vk_device_struct; + +typedef std::shared_ptr vk_device; + +typedef std::weak_ptr vk_device_ref; + +struct vk_buffer_struct; + +typedef std::shared_ptr vk_buffer; + +typedef std::weak_ptr vk_buffer_ref; + +struct ggml_backend_vk_buffer_type_context { + std::string name; + vk_device device; +}; + +struct vk_command_buffer { + vk::CommandBuffer buf; + uint64_t use_counter = 0; + bool in_use = false; +}; + +struct vk_queue; + +struct vk_command_pool { + void init(vk_device& device, vk_queue *q_); + void destroy(vk::Device& device); + + vk::CommandPool pool; + // Using deque so the pointers to command buffers + // remain valid even if we add more + std::deque cmd_buffers; + + vk_queue *q; + + size_t buffers_in_use() const { + return std::count_if(cmd_buffers.begin(), cmd_buffers.end(), + [](const auto& cb) { return cb.in_use; }); + } +}; + +struct vk_queue { + uint32_t queue_family_index; + vk::Queue queue; + + vk_command_pool cmd_pool; + + vk::PipelineStageFlags stage_flags; + + bool transfer_only; + + // copy everything except the cmd_pool + void copyFrom(vk_queue &other) { + queue_family_index = other.queue_family_index; + queue = other.queue; + stage_flags = other.stage_flags; + transfer_only = other.transfer_only; + } +}; + +static constexpr uint32_t mul_mat_vec_max_cols = 8; + +static constexpr uint32_t p021_max_gqa_ratio = 8; + +enum vk_device_architecture { + OTHER, + AMD_GCN, + AMD_RDNA1, + AMD_RDNA2, + AMD_RDNA3, + INTEL_XE2, + NVIDIA_PRE_TURING, + NVIDIA_TURING, +}; + +enum vk_conv_shapes { + CONV_SHAPE_128x128, + CONV_SHAPE_64x32, + CONV_SHAPE_32x256, + CONV_SHAPE_64x128, + CONV_SHAPE_COUNT, +}; + +struct vk_conv_block_size { + uint32_t K; + uint32_t NPQ; + uint32_t CRS; +}; + +inline vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = { + // K NPQ CRS + { 128, 128, 16 }, // CONV_SHAPE_128x128 + { 64, 32, 32 }, // CONV_SHAPE_64x32 + { 32, 256, 16 }, // CONV_SHAPE_32x256 + { 64, 128, 16 }, // CONV_SHAPE_64x128 +}; + +enum dmmv_wg_sizes { + DMMV_WG_SIZE_SUBGROUP, + DMMV_WG_SIZE_LARGE, + DMMV_WG_SIZE_COUNT, +}; + +enum FaCodePath { + FA_SCALAR, + FA_COOPMAT1, + FA_COOPMAT2, +}; + +struct vk_fa_pipeline_state { + uint32_t HSK, HSV; + uint32_t Br, Bc; + uint32_t D_split, row_split; + bool shmem_staging; + FaCodePath path; + uint32_t workgroup_size, subgroup_size; + bool aligned; + bool f32acc; + uint32_t flags; + uint32_t limit_occupancy_shmem; + ggml_type k_type; + ggml_type v_type; + + bool operator<(const vk_fa_pipeline_state &b) const { + return std::tie(HSK, HSV, Br, Bc, D_split, row_split, shmem_staging, path, workgroup_size, subgroup_size, aligned, f32acc, flags, limit_occupancy_shmem, k_type, v_type) < + std::tie(b.HSK, b.HSV, b.Br, b.Bc, b.D_split, b.row_split, b.shmem_staging, b.path, b.workgroup_size, b.subgroup_size, b.aligned, b.f32acc, b.flags, b.limit_occupancy_shmem, b.k_type, b.v_type); + } +}; + +struct vk_conv2d_pipeline_state { + vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned) + : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {} + + uint32_t s0, s1, p0, p1, d0, d1, KW, KH; + // when set, shader can skip K/CRS/NPQ bounds checks and address clamps + uint32_t aligned; + + bool operator<(const vk_conv2d_pipeline_state &b) const { + return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) < + std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned); + } +}; + +struct vk_solve_tri_pipeline_state { + vk_solve_tri_pipeline_state(uint32_t N, uint32_t K) + : N(N), K(K) {} + + uint32_t N, K; + + bool operator<(const vk_solve_tri_pipeline_state &b) const { + return std::tie(N, K) < + std::tie(b.N, b.K); + } +}; + +enum shader_reduction_mode { + SHADER_REDUCTION_MODE_SHMEM, + SHADER_REDUCTION_MODE_HYBRID, + SHADER_REDUCTION_MODE_SUBGROUP, + SHADER_REDUCTION_MODE_COUNT, +}; + +static constexpr uint32_t num_argsort_pipelines = 11; + +static constexpr uint32_t num_topk_moe_pipelines = 10; + +static constexpr uint32_t num_topk_pipelines = 11; + +static constexpr std::initializer_list topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT, + GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE, + GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, + GGML_OP_RESHAPE }; + +static constexpr std::initializer_list topk_moe_sigmoid_norm_bias{ GGML_OP_UNARY, GGML_OP_RESHAPE, GGML_OP_ADD, + GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, + GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, + GGML_OP_DIV, GGML_OP_RESHAPE }; + +static constexpr std::initializer_list topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT, + GGML_OP_VIEW, GGML_OP_GET_ROWS }; + +static constexpr std::initializer_list topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW, + GGML_OP_GET_ROWS, GGML_OP_RESHAPE, + GGML_OP_SOFT_MAX, GGML_OP_RESHAPE }; + +static constexpr std::initializer_list snake_pattern { GGML_OP_MUL, GGML_OP_SIN, + GGML_OP_SQR, GGML_OP_MUL, + GGML_OP_ADD }; + +static constexpr std::initializer_list> topk_moe_early_softmax_norm_edges { + { 1, 0, 0 }, // reshape->src[0] == softmax + { 2, 0, 0 }, // argsort->src[0] == softmax + { 3, 0, 2 }, // view->src[0] == argsort + { 4, 0, 1 }, // get_rows->src[0] == reshape + { 4, 1, 3 }, // get_rows->src[1] == view + { 5, 0, 4 }, // reshape->src[0] == get_rows + { 6, 0, 5 }, // sum_rows->src[0] == reshape + { 7, 0, 6 }, // clamp->src[0] == sum_rows + { 8, 0, 5 }, // div->src[0] == reshape + { 8, 1, 7 }, // div->src[1] == clamp + { 9, 0, 8 }, // reshape->src[0] == div +}; + +static constexpr std::initializer_list> topk_moe_sigmoid_norm_bias_edges { + { 1, 0, 0 }, // reshape->src[0] == sigmoid + { 2, 0, 0 }, // add->src[0] == sigmoid + { 3, 0, 2 }, // argsort->src[0] == add + { 4, 0, 3 }, // view->src[0] == argsort + { 5, 0, 1 }, // get_rows->src[0] == reshape + { 5, 1, 4 }, // get_rows->src[1] == view + { 6, 0, 5 }, // reshape->src[0] == get_rows + { 7, 0, 6 }, // sum_rows->src[0] == reshape + { 8, 0, 7 }, // clamp->src[0] == sum_rows + { 9, 0, 6 }, // div->src[0] == reshape + { 9, 1, 8 }, // div->src[1] == clamp + {10, 0, 9 }, // reshape->src[0] == div +}; + +static constexpr std::initializer_list> topk_moe_early_softmax_edges { + { 1, 0, 0 }, // reshape->src[0] == softmax + { 2, 0, 0 }, // argsort->src[0] == softmax + { 3, 0, 2 }, // view->src[0] == argsort + { 4, 0, 1 }, // get_rows->src[0] == reshape + { 4, 1, 3 }, // get_rows->src[1] == view +}; + +static constexpr std::initializer_list> topk_moe_late_softmax_edges { + { 1, 0, 0 }, // view->src[0] == argsort + { 2, 1, 1 }, // get_rows->src[1] == view + { 3, 0, 2 }, // reshape->src[0] == get_rows + { 4, 0, 3 }, // soft_max->src[0] == reshape + { 5, 0, 4 }, // reshape->src[0] == soft_max +}; + +enum topk_moe_mode { + TOPK_MOE_EARLY_SOFTMAX, + TOPK_MOE_EARLY_SOFTMAX_NORM, + TOPK_MOE_LATE_SOFTMAX, + TOPK_MOE_SIGMOID_NORM_BIAS, + TOPK_MOE_COUNT, +}; + +static constexpr std::initializer_list> rope_view_set_rows_edges { + { 1, 0, 0 }, // view->src[0] == rope + { 2, 0, 1 }, // set_rows->src[0] == view +}; + +static constexpr std::initializer_list> rms_norm_mul_rope_view_set_rows_edges { + { 1, 0, 0 }, // mul->src[0] == rms + { 2, 0, 1 }, // rope->src[0] == mul + { 3, 0, 2 }, // view->src[0] == rope + { 4, 0, 3 }, // set_rows->src[0] == view +}; + +class vk_memory_logger; + +void ggml_vk_destroy_buffer(vk_buffer& buf); +void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); + +struct vk_device_struct { + std::recursive_mutex mutex; + mutable std::shared_mutex pinned_memory_mutex; + + // Guards compile_pending, all_pipelines, and the dynamic pipeline maps + // (flash_attn, fa_mask_opt, solve_tri, conv2d, etc). The actual compile + // runs with no lock held, so different pipelines can compile in parallel. + // Lock order is device->mutex -> compile_mutex, never the reverse. + std::mutex compile_mutex; + std::condition_variable compile_cv; + + vk::PhysicalDevice physical_device; + vk::PhysicalDeviceProperties properties; + std::string name; + uint64_t max_memory_allocation_size; + uint64_t max_buffer_size; + uint64_t suballocation_block_size; + uint64_t min_imported_host_pointer_alignment; + bool external_memory_host {}; + bool fp16; + bool bf16; + bool pipeline_robustness; + bool memory_priority; + vk::Device device; + uint32_t vendor_id; + vk::DriverId driver_id; + vk_device_architecture architecture; + vk_queue compute_queue; + vk_queue transfer_queue; + bool single_queue; + bool support_async; + bool async_use_transfer_queue; + uint32_t subgroup_size; + uint32_t subgroup_size_log2; + uint32_t shader_core_count; + bool uma; + bool prefer_host_memory; + bool float_controls_rte_fp16; + bool subgroup_basic; + bool subgroup_arithmetic; + bool subgroup_shuffle; + bool subgroup_ballot; + bool subgroup_clustered; + bool subgroup_vote; + bool multi_add; + bool shader_int64; + bool buffer_device_address; + bool vulkan_memory_model; + + bool add_rms_fusion; + uint32_t partials_binding_alignment; + + bool shader_64b_indexing; + + bool integer_dot_product; + // 0: default, 1: force mmvq, -1: disable mmvq + int32_t mmvq_mode; + + bool subgroup_size_control; + uint32_t subgroup_min_size; + uint32_t subgroup_max_size; + bool subgroup_require_full_support; + + // floor(log2(maxComputeWorkGroupInvocations)) + uint32_t max_workgroup_size_log2 {}; + + bool coopmat_support; + bool coopmat_acc_f32_support {}; + bool coopmat_acc_f16_support {}; + bool coopmat_bf16_support {}; + bool coopmat_support_16x16x16_f16acc {}; + bool coopmat_support_16x16x16_f32acc {}; + bool coopmat1_fa_support {}; + uint32_t coopmat_m; + uint32_t coopmat_n; + uint32_t coopmat_k; + + bool coopmat_int_support; + uint32_t coopmat_int_m; + uint32_t coopmat_int_n; + uint32_t coopmat_int_k; + + bool coopmat2; + bool coopmat2_bf16_support {}; + bool coopmat2_decode_vector; + + bool dot2_f16 {}; + + bool pipeline_executable_properties_support {}; + + size_t idx; + + bool mul_mat_l[GGML_TYPE_COUNT]; + bool mul_mat_m[GGML_TYPE_COUNT]; + bool mul_mat_s[GGML_TYPE_COUNT]; + bool mul_mat_id_l[GGML_TYPE_COUNT]; + bool mul_mat_id_m[GGML_TYPE_COUNT]; + bool mul_mat_id_s[GGML_TYPE_COUNT]; + + // Separate flags for the q8_1 (integer dot) mmq path, whose shader uses + // a different shared-memory layout than the float matmul shaders. + bool mul_mat_l_int[GGML_TYPE_COUNT]; + bool mul_mat_m_int[GGML_TYPE_COUNT]; + bool mul_mat_s_int[GGML_TYPE_COUNT]; + bool mul_mat_id_l_int[GGML_TYPE_COUNT]; + bool mul_mat_id_m_int[GGML_TYPE_COUNT]; + bool mul_mat_id_s_int[GGML_TYPE_COUNT]; + + vk::DescriptorSetLayout dsl; + + vk_matmul_pipeline pipeline_matmul_f32 {}; + vk_matmul_pipeline pipeline_matmul_f32_f16 {}; + vk_matmul_pipeline pipeline_matmul_bf16 {}; + vk_matmul_pipeline2 pipeline_matmul_f16; + vk_matmul_pipeline2 pipeline_matmul_f16_f32; + + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_COUNT]; + + vk_matmul_pipeline pipeline_matmul_id_f32 {}; + vk_matmul_pipeline pipeline_matmul_id_bf16 {}; + vk_matmul_pipeline2 pipeline_matmul_id_f16; + vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; + + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_COUNT]; + + vk_pipeline pipeline_matmul_split_k_reduce; + vk_pipeline pipeline_quantize_q8_1_x4; + + vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT]; + + vk_pipeline pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_id_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT]; + + vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio]; + vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; + vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; + vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_acc_f32; + vk_pipeline pipeline_set_f32; + + // [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16][dst 0=fp32,1=fp16] + vk_pipeline pipeline_add[2][2][2]; + vk_pipeline pipeline_add_norepeat[2][2][2]; + vk_pipeline pipeline_sub[2][2][2]; + vk_pipeline pipeline_sub_norepeat[2][2][2]; + vk_pipeline pipeline_mul[2][2][2]; + vk_pipeline pipeline_mul_norepeat[2][2][2]; + vk_pipeline pipeline_div[2][2][2]; + vk_pipeline pipeline_div_norepeat[2][2][2]; + vk_pipeline pipeline_add_rms[2][2][2]; + vk_pipeline pipeline_add_rms_norepeat[2][2][2]; + + // indexed by num_additional_fused_ops == num_adds - 1 + vk_pipeline pipeline_multi_add[MAX_FUSED_ADDS]; + vk_pipeline pipeline_multi_add_rms[MAX_FUSED_ADDS]; + + vk_pipeline pipeline_add_id_f32; + + vk_pipeline pipeline_concat_i8, pipeline_concat_i16, pipeline_concat_i32, pipeline_concat_i64; + vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32; + vk_pipeline pipeline_scale_f32; + vk_pipeline pipeline_sqr_f32; + vk_pipeline pipeline_sqrt_f32; + vk_pipeline pipeline_sin_f32; + vk_pipeline pipeline_cos_f32; + vk_pipeline pipeline_log[2]; + vk_pipeline pipeline_tri[2]; + vk_pipeline pipeline_diag[2]; + vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_pad_f32; + vk_pipeline pipeline_roll_f32; + vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32; + vk_pipeline pipeline_repeat_i16; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32; + vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32; + vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; + vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32; + vk_pipeline pipeline_set_rows_i32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_set_rows_i64[GGML_TYPE_COUNT]; + vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_group_norm_f32; + vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_rms_norm_mul_f32; + vk_pipeline pipeline_rms_norm_partials_f32; + vk_pipeline pipeline_rms_norm_mul_partials_f32; + vk_pipeline pipeline_rms_norm_mul_rope_f32_f32; + vk_pipeline pipeline_rms_norm_mul_rope_f32_f16; + vk_pipeline pipeline_rms_norm_back_f32; + vk_pipeline pipeline_l2_norm_f32; + + // [src/dst 0=fp32,1=fp16] + vk_pipeline pipeline_exp[2]; + vk_pipeline pipeline_expm1[2]; + vk_pipeline pipeline_elu[2]; + vk_pipeline pipeline_gelu[2]; + vk_pipeline pipeline_gelu_erf[2]; + vk_pipeline pipeline_gelu_quick[2]; + vk_pipeline pipeline_silu[2]; + vk_pipeline pipeline_relu[2]; + vk_pipeline pipeline_xielu[2]; + vk_pipeline pipeline_neg[2]; + vk_pipeline pipeline_tanh[2]; + vk_pipeline pipeline_sigmoid[2]; + vk_pipeline pipeline_hardsigmoid[2]; + vk_pipeline pipeline_hardswish[2]; + vk_pipeline pipeline_abs[2]; + vk_pipeline pipeline_softplus[2]; + vk_pipeline pipeline_step[2]; + vk_pipeline pipeline_round[2]; + vk_pipeline pipeline_ceil[2]; + vk_pipeline pipeline_floor[2]; + vk_pipeline pipeline_trunc[2]; + vk_pipeline pipeline_sgn[2]; + + vk_pipeline pipeline_add1_f16_f16; + vk_pipeline pipeline_add1_f16_f32; + vk_pipeline pipeline_add1_f32_f32; + + vk_pipeline pipeline_arange_f32; + + vk_pipeline pipeline_fill_f32; + vk_pipeline pipeline_fill_f16; + + vk_pipeline pipeline_geglu[2]; + vk_pipeline pipeline_reglu[2]; + vk_pipeline pipeline_swiglu[2]; + vk_pipeline pipeline_swiglu_oai[2]; + vk_pipeline pipeline_geglu_erf[2]; + vk_pipeline pipeline_geglu_quick[2]; + + vk_pipeline pipeline_leaky_relu_f32; + vk_pipeline pipeline_silu_back_f32; + vk_pipeline pipeline_diag_mask_inf_f32; + vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; + vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; + vk_pipeline pipeline_soft_max_back_f32; + + vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16; + vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16; + vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16; + + vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; + vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; + vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16, pipeline_rope_multi_f32_f16; + vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16; + vk_pipeline pipeline_argsort_f32[num_argsort_pipelines]; + vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines]; + vk_pipeline pipeline_topk_f32[num_topk_pipelines]; + vk_pipeline pipeline_sum_rows_f32; + vk_pipeline pipeline_fwht_f32[4]; + vk_pipeline pipeline_cumsum_f32; + vk_pipeline pipeline_cumsum_small_f32; + vk_pipeline pipeline_cumsum_multipass1_f32; + vk_pipeline pipeline_cumsum_multipass2_f32; + vk_pipeline pipeline_argmax_f32; + vk_pipeline pipeline_count_equal_i32; + std::map pipeline_solve_tri_f32; + vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; + vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16; + vk_pipeline pipeline_timestep_embedding_f32; + vk_pipeline pipeline_conv_transpose_1d_f32; + vk_pipeline pipeline_col2im_1d_f32; + vk_pipeline pipeline_col2im_1d_f16; + vk_pipeline pipeline_col2im_1d_bf16; + vk_pipeline pipeline_snake_f32; + vk_pipeline pipeline_snake_f16; + vk_pipeline pipeline_snake_bf16; + vk_pipeline pipeline_pool2d_f32; + vk_pipeline pipeline_rwkv_wkv6_f32; + vk_pipeline pipeline_rwkv_wkv7_f32; + // [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128 + vk_pipeline pipeline_gated_delta_net[4][2]; + vk_pipeline pipeline_ssm_scan_f32_d128; + vk_pipeline pipeline_ssm_scan_f32_d256; + vk_pipeline pipeline_ssm_conv_f32; + vk_pipeline pipeline_ssm_conv_silu_f32; + vk_pipeline pipeline_ssm_conv_bias_silu_f32; + vk_pipeline pipeline_opt_step_adamw_f32; + vk_pipeline pipeline_opt_step_sgd_f32; + std::map pipeline_conv2d_f32[CONV_SHAPE_COUNT]; + std::map pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT]; + std::map pipeline_conv_transpose_2d_f32[CONV_SHAPE_COUNT]; + std::map pipeline_conv_transpose_2d_f16_f32[CONV_SHAPE_COUNT]; + vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32; + vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32; + + std::map pipeline_flash_attn_f32_f16; + + std::map, vk_pipeline> pipeline_fa_mask_opt; + + vk_pipeline pipeline_flash_attn_split_k_reduce; + vk_pipeline pipeline_count_experts; + + // [2] is for whether to take n_experts from spec constant (0) or push constant (1) + vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2]; + + std::vector all_pipelines; + + std::vector> pinned_memory; + + vk::Fence fence; + vk_buffer sync_staging; + + ggml_backend_buffer_type buffer_type; + + bool disable_fusion; + bool disable_host_visible_vidmem; + bool allow_sysmem_fallback; + bool disable_graph_optimize; + + std::unique_ptr memory_logger; + + ~vk_device_struct() { + VK_LOG_DEBUG("destroy device " << name); + + device.destroyFence(fence); + + ggml_vk_destroy_buffer(sync_staging); + + compute_queue.cmd_pool.destroy(device); + transfer_queue.cmd_pool.destroy(device); + + for (auto& pipeline : all_pipelines) { + if (pipeline.expired()) { + continue; + } + + vk_pipeline pl = pipeline.lock(); + ggml_vk_destroy_pipeline(device, pl); + } + all_pipelines.clear(); + + device.destroyDescriptorSetLayout(dsl); + + device.destroy(); + } +}; + +inline void vk_command_pool::init(vk_device& device, vk_queue *q_) { + cmd_buffers.clear(); + q = q_; + + vk::CommandPoolCreateInfo command_pool_create_info( + vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT), + q->queue_family_index); + pool = device->device.createCommandPool(command_pool_create_info); +} + +inline void vk_command_pool::destroy(vk::Device& device) { + device.destroyCommandPool(pool); + pool = nullptr; + cmd_buffers.clear(); +} + +struct vk_buffer_struct { + vk::Buffer buffer = VK_NULL_HANDLE; + vk::DeviceMemory device_memory = VK_NULL_HANDLE; + vk::MemoryPropertyFlags memory_property_flags; + void * ptr; + size_t size = 0; + vk::DeviceAddress bda_addr {}; + + vk_device device; + + ~vk_buffer_struct() { + if (size == 0) { + return; + } + VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")"); + + device->device.freeMemory(device_memory); + device->device.destroyBuffer(buffer); + } +}; + +struct vk_subbuffer { + vk_buffer buffer; + uint64_t offset; + uint64_t size; + + operator vk::DescriptorBufferInfo() const { + return { buffer->buffer, offset, size }; + } +}; + +struct vk_semaphore { + vk::Semaphore s; + uint64_t value; +}; + +struct vk_event { + std::vector events_free; // Events available for reuse + std::vector events_submitted; // Events that are fully submitted and can be reused on next synchronize + vk::Event event; + bool has_event; + + vk_semaphore tl_semaphore; + vk_command_buffer* cmd_buffer = nullptr; + uint64_t cmd_buffer_use_counter = 0; +}; + +struct vk_submission { + vk_command_buffer* buffer = nullptr; + std::vector wait_semaphores; + std::vector signal_semaphores; +}; + +typedef std::vector vk_sequence; + +#define MAT_VEC_FUSION_FLAGS_BIAS0 0x1 + +#define MAT_VEC_FUSION_FLAGS_BIAS1 0x2 + +#define MAT_VEC_FUSION_FLAGS_SCALE0 0x4 + +#define MAT_VEC_FUSION_FLAGS_SCALE1 0x8 + +struct vk_staging_memcpy { + vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} + + void * dst; + const void * src; + size_t n; +}; + +struct vk_staging_memset { + vk_staging_memset(void * _dst, uint32_t _val, size_t _n) : dst(_dst), val(_val), n(_n) {} + + void * dst; + uint32_t val; + size_t n; +}; + +struct vk_context_struct { + vk_submission * s; + std::vector seqs; + + int exit_tensor_idx; + + std::vector in_memcpys; + std::vector out_memcpys; + std::vector memsets; + + vk_command_pool * p {}; +}; + +typedef std::shared_ptr vk_context; + +typedef std::weak_ptr vk_context_ref; + +struct ggml_vk_garbage_collector { + std::vector tl_semaphores; + std::vector semaphores; + std::vector events; + std::vector contexts; +}; + +#define VK_LOG_MEMORY(msg) if (vk_memory_logger_enabled) { std::cerr << "ggml_vulkan memory: " << msg << std::endl; } + +static std::string format_size(size_t size) { + const size_t kib = 1024; + const size_t mib = kib * 1024; + const size_t gib = mib * 1024; + + std::ostringstream oss; + oss << std::fixed << std::setprecision(2); + + if (size >= gib) { + oss << static_cast(size) / gib << " GiB"; + } else if (size >= mib) { + oss << static_cast(size) / mib << " MiB"; + } else if (size >= kib) { + oss << static_cast(size) / kib << " KiB"; + } else { + oss << size << " B"; + } + + return oss.str(); +} + +class vk_memory_logger { +public: + vk_memory_logger(): total_device(0), total_host(0) {} + void log_allocation(vk_buffer_ref buf_ref, size_t size); + void log_deallocation(vk_buffer_ref buf_ref); + +private: + std::map allocations; // Track allocations + size_t total_device; + size_t total_host; + static std::mutex log_mutex; +}; + +inline std::mutex vk_memory_logger::log_mutex; + +extern uint32_t vk_perf_logger_frequency; + +class vk_perf_logger { + public: + void print_timings(bool force = false) { + if (timings.empty()) { + return; + } + print_count++; + if ((print_count % vk_perf_logger_frequency) != 0 && !force) { + return; + } + print_count = 0; + uint64_t total_all_op_times = 0; + std::cerr << "----------------\nVulkan Timings:" << std::endl; + for (const auto & t : timings) { + uint64_t total_op_times = 0; + for (const auto & time : t.second) { + total_op_times += time; + } + std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) + << " us = " << (total_op_times / 1000.0) << " us"; + + // If we have as many flops entries as timing entries for the op, then compute and log the flops/S. + auto it = flops.find(t.first); + if (it != flops.end() && (it->second).size() == t.second.size()) { + uint64_t total_op_flops = 0; + for (const auto & elem : it->second) { + total_op_flops += elem; + } + std::cerr << " (" + << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) / + (double(total_op_times) / (1000.0 * 1000.0 * 1000.0)) + << " GFLOPS/s)"; + } + + total_all_op_times += total_op_times; + + std::cerr << std::endl; + } + + if (timings.size() > 0) { + std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl; + } + + timings.clear(); + flops.clear(); + } + + std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) { + *n_flops = 0; + std::string fusion_str; + if (fusion_name) { + fusion_str = fusion_name + std::string(" "); + } + if (node->op == GGML_OP_UNARY) { + return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node)); + } + if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { + const uint64_t m = node->ne[0]; + const uint64_t n = node->ne[1]; + const uint64_t k = node->src[1]->ne[0]; + const uint64_t batch = node->ne[2] * node->ne[3]; + std::string name = ggml_op_name(node->op); + if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) || + (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) { + name += "_VEC"; + } + name += " "; + name += ggml_type_name(node->src[0]->type); + name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); + if (node->op == GGML_OP_MUL_MAT_ID) { + name += " n_expert=" + std::to_string(node->src[0]->ne[2]); + } + if (batch > 1) { + name += " batch=" + std::to_string(batch); + } + name = fusion_str + name; + *n_flops = m * n * (k + (k - 1)) * batch; + return name; + } + if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) { + std::string name = ggml_op_name(node->op); + ggml_tensor * knl = node->src[0]; + uint64_t OW = node->ne[0]; + uint64_t OH = node->ne[1]; + uint64_t N = node->ne[3]; + uint64_t Cout = node->ne[2]; + uint64_t KW = knl->ne[0]; + uint64_t KH = knl->ne[1]; + uint64_t Cin = node->src[1]->ne[2]; + // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ + uint64_t size_M = Cout; + uint64_t size_K = Cin * KW * KH; + uint64_t size_N = N * OW * OH; + *n_flops = size_M * size_N * (size_K + (size_K - 1)); + name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + + ", N=N*OW*OH=" + std::to_string(size_N); + name = fusion_str + name; + return name; + } + if (node->op == GGML_OP_RMS_NORM) { + std::string name = ggml_op_name(node->op); + name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; + name = fusion_str + name; + return name; + } + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + const ggml_tensor * dst = node; + const ggml_tensor * q = node->src[0]; + const ggml_tensor * k = node->src[1]; + const ggml_tensor * v = node->src[2]; + const ggml_tensor * m = node->src[3]; + std::stringstream name; + name << fusion_str; + name << ggml_op_name(node->op) << + " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " << + " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " << + " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " << + " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " << + " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")"; + *n_flops = 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3]; + return name.str(); + } + if (node->op == GGML_OP_TOP_K) { + std::stringstream name; + name << fusion_str; + name << ggml_op_name(node->op) << + " K=" << node->ne[0] << + " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")"; + return name.str(); + } + return fusion_str + ggml_op_name(node->op); + } + + void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { + uint64_t n_flops; + std::string name = get_node_fusion_name(node, fusion_name, &n_flops); + if (n_flops) { + flops[name].push_back(n_flops); + } + timings[name].push_back(time); + } + + void log_timing(const std::vector &nodes, const std::vector &names, uint64_t time) { + uint64_t total_flops = 0; + std::string name; + for (size_t n = 0; n < nodes.size(); ++n) { + uint64_t n_flops = 0; + name += get_node_fusion_name(nodes[n], names[n], &n_flops); + total_flops += n_flops; + + if (n != nodes.size() - 1) { + name += ", "; + } + } + if (total_flops) { + flops[name].push_back(total_flops); + } + timings[name].push_back(time); + } + + private: + std::map> timings; + std::map> flops; + uint32_t print_count {}; +}; + +struct ggml_backend_vk_context { + std::string name; + + vk_device device; + + size_t semaphore_idx, event_idx; + ggml_vk_garbage_collector gc; + size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset; + vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials, sync_staging; + vk::Fence fence, almost_ready_fence; + bool submit_pending {}; + bool almost_ready_fence_pending {}; + // Set before op_add and unset after op_rms_norm to indicate that the add should + // write partial sums to accumulate the square of the vector components + bool do_add_rms_partials_offset_calculation; + bool do_add_rms_partials; + + uint64_t last_total_mul_mat_bytes {}; + + // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. + vk_pipeline_struct * prealloc_y_last_pipeline_used {}; + const ggml_tensor * prealloc_y_last_tensor_used {}; + // True when prealloc_y holds the padded fp16 layout used by the coopmat2 B decode-vector callback. + // If false, then it's contiguous. + bool prealloc_y_last_decode_vector_staging {}; + + // Track which nodes have been used since the last sync, and whether they were written to + std::vector unsynced_nodes_written; + std::vector unsynced_nodes_read; + // Track which prealloc buffers have pending reads that need to be synchronized. + // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set), + // and set to true after the buffer contents are consumed. + bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync; + + vk_context_ref compute_ctx; + + vk_context_ref transfer_ctx; + vk_semaphore transfer_semaphore; + uint64_t transfer_semaphore_last_submitted {}; + + std::vector tensor_ctxs; + + std::vector descriptor_pools; + std::vector descriptor_sets; + uint32_t descriptor_set_idx {}; + uint32_t pipeline_descriptor_set_requirements {}; + + vk_command_pool compute_cmd_pool; + vk_command_pool transfer_cmd_pool; + + // number of additional consecutive nodes that are being fused with the + // node currently being processed + int num_additional_fused_ops {}; + // Bitmask of which fused ops need to write an intermediate value to memory. + // Bit 'i' means nodes[start_of_fusion + i] writes to memory. + // If there's no fusion, bit 0 is still set. + int fused_ops_write_mask {}; + topk_moe_mode fused_topk_moe_mode {}; + bool fused_topk_moe_scale {}; + + // for GGML_VK_PERF_LOGGER + std::unique_ptr perf_logger; + vk::QueryPool query_pool; + std::vector query_fusion_names; + std::vector query_fusion_node_count; + std::vector query_nodes; + std::vector query_node_idx; + int32_t num_queries {}; + int32_t query_idx {}; +}; + +struct ggml_backend_vk_buffer_context { + vk_device_ref device; + vk_buffer dev_buffer; + std::string name; + + ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : + device(device), + dev_buffer(dev_buffer), + name(name) { + } + + ~ggml_backend_vk_buffer_context() { + ggml_vk_destroy_buffer(dev_buffer); + } +}; + +struct vk_instance_t { + vk::Instance instance; + + bool debug_utils_support = false; // VK_EXT_debug_utils enabled + PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; + PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; + PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; + PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; + PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; + PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; + + std::vector device_indices; + std::vector device_supports_membudget; + vk_device devices[GGML_VK_MAX_DEVICES]; +}; + +typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367; + +static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447; + +static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4; + +struct vk_fa_tuning_params { + FaCodePath path; + uint32_t workgroup_size; + uint32_t subgroup_size; + uint32_t block_rows; + uint32_t block_cols; + uint32_t d_split; + uint32_t row_split; + bool shmem_staging; + bool disable_subgroups; + uint32_t limit_occupancy_shmem; + + void print() const { + std::cerr << "path=" << path << " workgroup_size=" << workgroup_size << " subgroup_size=" << subgroup_size << + " block_rows=" << block_rows << " block_cols=" << block_cols << " d_split=" << d_split << + " row_split=" << row_split << " shmem_staging=" << shmem_staging << " disable_subgroups=" << disable_subgroups << + " limit_occupancy_shmem=" << limit_occupancy_shmem << std::endl; + } +}; + +struct GpuPipelineConfig { + // GPU architecture identifier. + // Example: vk_device_architecture::AMD_GCN + vk_device_architecture arch; + + // Mapping of pipeline names to their specific subgroup sizes. + // Example: {"soft_max_f32", 64} + std::unordered_map pipelines; + + // Default subgroup size for this GPU. + // Defaults to 0 if not explicitly provided. + uint32_t default_subgroup_size = 0; +}; + +static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; + +struct CompileTask { + vk_pipeline pipeline; + size_t spv_size; + const void * spv_data; + std::string entrypoint; + uint32_t parameter_count; + std::array wg_denoms; + std::vector specialization_constants; + bool disable_robustness; + bool require_full_subgroups; + uint32_t required_subgroup_size; +}; + +#define UNUSED GGML_UNUSED + +struct ggml_backend_vk_device_context { + size_t device; + std::string name; + std::string description; + bool is_integrated_gpu; + std::string pci_bus_id; + int op_offload_min_batch_size; +}; + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp deleted file mode 100644 index 9a36b45de8..0000000000 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ /dev/null @@ -1,18393 +0,0 @@ -#include "ggml-vulkan.h" -#include -#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) -#include -#include "ggml-cpu.h" -#endif - -// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- -#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 -// We use VULKAN_HPP_DEFAULT_DISPATCHER, but not VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE -// to avoid conflicts with applications or other libraries who might use it. -#if VK_HEADER_VERSION >= 301 -namespace vk::detail { class DispatchLoaderDynamic; } -using vk::detail::DispatchLoaderDynamic; -#else -namespace vk { class DispatchLoaderDynamic; } -using vk::DispatchLoaderDynamic; -#endif -DispatchLoaderDynamic & ggml_vk_default_dispatcher(); -#define VULKAN_HPP_DEFAULT_DISPATCHER ggml_vk_default_dispatcher() - -#include - -// Fallback definitions for VK_NV_cooperative_matrix_decode_vector in case the -// installed Vulkan headers predate the extension. -#ifndef VK_NV_cooperative_matrix_decode_vector -#define VK_NV_cooperative_matrix_decode_vector 1 -#define VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME "VK_NV_cooperative_matrix_decode_vector" -#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV ((VkStructureType)1000689000) -typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV { - VkStructureType sType; - void* pNext; - VkBool32 cooperativeMatrixDecodeVector; -} VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV; -#endif - -// SPIR-V Headers: different SDK installations expose different include paths. -// LunarG Vulkan SDK on Windows typically provides . -// Linux packages, MSYS2 and MinGW often use the Khronos layout . -#if __has_include() -# include -#elif __has_include() -# include -#elif __has_include() -# include -#else - // Fallback to let the compiler throw a standard "file not found" error -# include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -# define NOMINMAX 1 -# include -# define YIELD() YieldProcessor() -#elif defined(__clang__) || defined(__GNUC__) -# if defined(__x86_64__) ||defined(__i386__) -# include -# define YIELD() _mm_pause() -# elif defined(__arm__) || defined(__aarch64__) -# if defined(__clang__) -# include -# define YIELD() __yield() -# else -# define YIELD() asm volatile("yield") -# endif -# endif -#endif - -#if !defined(YIELD) -#define YIELD() -#endif - -#include "ggml-impl.h" -#include "ggml-backend-impl.h" - -#include "ggml-vulkan-shaders.hpp" - -// remove this once it's more widely available in the SDK -#if !defined(VK_KHR_shader_bfloat16) - -#define VK_KHR_shader_bfloat16 1 -#define VK_KHR_SHADER_BFLOAT16_SPEC_VERSION 1 -#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16" -#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000) -#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000) - -typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 shaderBFloat16Type; - VkBool32 shaderBFloat16DotProduct; - VkBool32 shaderBFloat16CooperativeMatrix; -} VkPhysicalDeviceShaderBfloat16FeaturesKHR; -#endif - -#if !defined(VK_VALVE_shader_mixed_float_dot_product) -#define VK_VALVE_shader_mixed_float_dot_product 1 -#define VK_VALVE_SHADER_MIXED_FLOAT_DOT_PRODUCT_SPEC_VERSION 1 -#define VK_VALVE_SHADER_MIXED_FLOAT_DOT_PRODUCT_EXTENSION_NAME "VK_VALVE_shader_mixed_float_dot_product" -#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE ((VkStructureType)1000673000) -typedef struct VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE { - VkStructureType sType; - void* pNext; - VkBool32 shaderMixedFloatDotProductFloat16AccFloat32; - VkBool32 shaderMixedFloatDotProductFloat16AccFloat16; - VkBool32 shaderMixedFloatDotProductBFloat16Acc; - VkBool32 shaderMixedFloatDotProductFloat8AccFloat32; -} VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE; -#endif - -#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) -#define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) -static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } - -#define VK_VENDOR_ID_AMD 0x1002 -#define VK_VENDOR_ID_APPLE 0x106b -#define VK_VENDOR_ID_INTEL 0x8086 -#define VK_VENDOR_ID_NVIDIA 0x10de -#define VK_VENDOR_ID_QUALCOMM 0x5143 - -#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256 - -#define VK_CHECK(err, msg) \ - do { \ - vk::Result err_ = (err); \ - if (err_ != vk::Result::eSuccess) { \ - fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n", \ - #err, to_string(err_).c_str(), __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) - -#ifdef GGML_VULKAN_DEBUG -#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl -#else -#define VK_LOG_DEBUG(msg) ((void) 0) -#endif // GGML_VULKAN_DEBUG - -struct ggml_backend_vk_context; - -#define MAX_PARAMETER_COUNT 12 -// Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT. -#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 3) - -typedef std::shared_ptr vk_pipeline; - -struct vk_pipeline_struct { - std::string name; - vk::ShaderModule shader_module; - vk::PipelineLayout layout; - vk::Pipeline pipeline; - uint32_t push_constant_size; - uint32_t parameter_count; - std::array wg_denoms; - uint32_t align; - // true if fields have been set by ggml_vk_create_pipeline - bool initialized {}; - // true while a compile is in flight, used to dedupe concurrent claims. - // Protected by device->compile_mutex. - bool compile_pending {}; - // set to true when the shader has been compiled - std::atomic compiled {}; - // number of registers used, extracted from pipeline executable properties - uint32_t register_count {}; - -#if defined(VK_EXT_shader_64bit_indexing) - bool is_64b_indexing {}; -#endif - // linked list of pipelines for multiple compilation variants. - // currently only used to compile a 64-bit indexing variant. - vk_pipeline next; -}; - -typedef std::weak_ptr vk_pipeline_ref; - -static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); - -struct vk_matmul_pipeline_struct { - vk_pipeline l, m, s; - vk_pipeline a_l, a_m, a_s; - // Returns true when all unaligned pipelines are null. - // We only check for unaligned variants since one of the unaligned pipelines must exist - // while aligned pipelines are optional - bool is_empty() const { - return l == nullptr && m == nullptr && s == nullptr; - } -}; -typedef std::shared_ptr vk_matmul_pipeline; - -struct vk_matmul_pipeline2 { - vk_matmul_pipeline2() { - f16acc = std::make_shared(); - f32acc = std::make_shared(); - } - vk_matmul_pipeline f32acc; - vk_matmul_pipeline f16acc; -}; - -struct vk_device_struct; -typedef std::shared_ptr vk_device; -typedef std::weak_ptr vk_device_ref; - -struct vk_buffer_struct; -typedef std::shared_ptr vk_buffer; -typedef std::weak_ptr vk_buffer_ref; - -struct ggml_backend_vk_buffer_type_context { - std::string name; - vk_device device; -}; - -struct vk_queue; - -struct vk_command_buffer { - vk::CommandBuffer buf; - uint64_t use_counter = 0; - bool in_use = false; -}; - -// Stores command pool/buffers. There's an instance of this -// for each (context,queue) pair and for each (device,queue) pair. -struct vk_command_pool { - void init(vk_device& device, vk_queue *q_); - void destroy(vk::Device& device); - - vk::CommandPool pool; - // Using deque so the pointers to command buffers - // remain valid even if we add more - std::deque cmd_buffers; - - vk_queue *q; - - size_t buffers_in_use() const { - return std::count_if(cmd_buffers.begin(), cmd_buffers.end(), - [](const auto& cb) { return cb.in_use; }); - } -}; - -// Prevent simultaneous submissions to the same queue. -// This could be per vk_queue if we stopped having two vk_queue structures -// sharing the same vk::Queue. -static std::mutex queue_mutex; - -struct vk_queue { - uint32_t queue_family_index; - vk::Queue queue; - - vk_command_pool cmd_pool; - - vk::PipelineStageFlags stage_flags; - - bool transfer_only; - - // copy everything except the cmd_pool - void copyFrom(vk_queue &other) { - queue_family_index = other.queue_family_index; - queue = other.queue; - stage_flags = other.stage_flags; - transfer_only = other.transfer_only; - } -}; - -static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); -static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); -static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); -static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); -static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); -static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { - /* .get_name = */ ggml_backend_vk_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, - /* .is_host = */ NULL, -}; - -class vk_memory_logger; -class vk_perf_logger; -static void ggml_vk_destroy_buffer(vk_buffer& buf); -static void ggml_vk_synchronize(ggml_backend_vk_context * ctx); - -static constexpr uint32_t mul_mat_vec_max_cols = 8; -static constexpr uint32_t p021_max_gqa_ratio = 8; - -enum vk_device_architecture { - OTHER, - AMD_GCN, - AMD_RDNA1, - AMD_RDNA2, - AMD_RDNA3, - INTEL_XE2, - NVIDIA_PRE_TURING, - NVIDIA_TURING, -}; - -static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { - vk::PhysicalDeviceProperties props = device.getProperties(); - - if (props.vendorID == VK_VENDOR_ID_AMD) { - const std::vector ext_props = device.enumerateDeviceExtensionProperties(); - - bool amd_shader_core_properties = false; - bool integer_dot_product = false; - bool subgroup_size_control = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) { - amd_shader_core_properties = true; - } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { - integer_dot_product = true; - } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { - subgroup_size_control = true; - } - } - - if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) { - return vk_device_architecture::OTHER; - } - - vk::PhysicalDeviceProperties2 props2; - vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd; - vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; - vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; - - props2.pNext = &shader_core_props_amd; - shader_core_props_amd.pNext = &integer_dot_props; - integer_dot_props.pNext = &subgroup_size_control_props; - - device.getProperties2(&props2); - - if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) { - return vk_device_architecture::AMD_GCN; - } - if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) { - // RDNA - if (shader_core_props_amd.wavefrontsPerSimd == 20) { - return vk_device_architecture::AMD_RDNA1; - } - if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) { - return vk_device_architecture::AMD_RDNA3; - } - return vk_device_architecture::AMD_RDNA2; - } - } else if (props.vendorID == VK_VENDOR_ID_INTEL) { - const std::vector ext_props = device.enumerateDeviceExtensionProperties(); - - bool subgroup_size_control = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { - subgroup_size_control = true; - } - } - - if (!subgroup_size_control) { - return vk_device_architecture::OTHER; - } - - vk::PhysicalDeviceProperties2 props2; - vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; - - props2.pNext = &subgroup_size_control_props; - device.getProperties2(&props2); - - if (subgroup_size_control_props.minSubgroupSize == 16) { - // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8. - // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value. - // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html - // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html - return vk_device_architecture::INTEL_XE2; - } - } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) { - const std::vector ext_props = device.enumerateDeviceExtensionProperties(); - - bool cooperative_matrix = false; - bool sm_builtins = false; - - // Detect "pre-turing" based on lack of coopmat support. - for (const auto& properties : ext_props) { - if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { - cooperative_matrix = true; - } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { - sm_builtins = true; - } - } - - if (!cooperative_matrix) { - return vk_device_architecture::NVIDIA_PRE_TURING; - } - - if (sm_builtins) { - vk::PhysicalDeviceProperties2 props2; - vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; - - props2.pNext = &sm_props; - - device.getProperties2(&props2); - - // Turing has 32, following architectures have 48 - if (sm_props.shaderWarpsPerSM == 32) { - return vk_device_architecture::NVIDIA_TURING; - } - } - } - return vk_device_architecture::OTHER; -} - -enum vk_conv_shapes { - CONV_SHAPE_128x128, - CONV_SHAPE_64x32, - CONV_SHAPE_32x256, - CONV_SHAPE_64x128, - CONV_SHAPE_COUNT, -}; - -struct vk_conv_block_size { - uint32_t K; - uint32_t NPQ; - uint32_t CRS; -}; - -vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = { - // K NPQ CRS - { 128, 128, 16 }, // CONV_SHAPE_128x128 - { 64, 32, 32 }, // CONV_SHAPE_64x32 - { 32, 256, 16 }, // CONV_SHAPE_32x256 - { 64, 128, 16 }, // CONV_SHAPE_64x128 -}; - -enum dmmv_wg_sizes { - DMMV_WG_SIZE_SUBGROUP, - DMMV_WG_SIZE_LARGE, - DMMV_WG_SIZE_COUNT, -}; - -enum FaCodePath { - FA_SCALAR, - FA_COOPMAT1, - FA_COOPMAT2, -}; - -struct vk_fa_pipeline_state { - uint32_t HSK, HSV; - uint32_t Br, Bc; - uint32_t D_split, row_split; - bool shmem_staging; - FaCodePath path; - uint32_t workgroup_size, subgroup_size; - bool aligned; - bool f32acc; - uint32_t flags; - uint32_t limit_occupancy_shmem; - ggml_type k_type; - ggml_type v_type; - - bool operator<(const vk_fa_pipeline_state &b) const { - return std::tie(HSK, HSV, Br, Bc, D_split, row_split, shmem_staging, path, workgroup_size, subgroup_size, aligned, f32acc, flags, limit_occupancy_shmem, k_type, v_type) < - std::tie(b.HSK, b.HSV, b.Br, b.Bc, b.D_split, b.row_split, b.shmem_staging, b.path, b.workgroup_size, b.subgroup_size, b.aligned, b.f32acc, b.flags, b.limit_occupancy_shmem, b.k_type, b.v_type); - } -}; - -struct vk_conv2d_pipeline_state { - vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned) - : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {} - - uint32_t s0, s1, p0, p1, d0, d1, KW, KH; - // when set, shader can skip K/CRS/NPQ bounds checks and address clamps - uint32_t aligned; - - bool operator<(const vk_conv2d_pipeline_state &b) const { - return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) < - std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned); - } -}; - -struct vk_solve_tri_pipeline_state { - vk_solve_tri_pipeline_state(uint32_t N, uint32_t K) - : N(N), K(K) {} - - uint32_t N, K; - - bool operator<(const vk_solve_tri_pipeline_state &b) const { - return std::tie(N, K) < - std::tie(b.N, b.K); - } -}; - -enum shader_reduction_mode { - SHADER_REDUCTION_MODE_SHMEM, - SHADER_REDUCTION_MODE_HYBRID, - SHADER_REDUCTION_MODE_SUBGROUP, - SHADER_REDUCTION_MODE_COUNT, -}; - -// argsort pipelines for up to 1<<10 invocations per workgroup -static constexpr uint32_t num_argsort_pipelines = 11; -static constexpr uint32_t num_topk_moe_pipelines = 10; -static constexpr uint32_t num_topk_pipelines = 11; - -static constexpr std::initializer_list topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT, - GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE, - GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, - GGML_OP_RESHAPE }; - -static constexpr std::initializer_list topk_moe_sigmoid_norm_bias{ GGML_OP_UNARY, GGML_OP_RESHAPE, GGML_OP_ADD, - GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, - GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, - GGML_OP_DIV, GGML_OP_RESHAPE }; - -static constexpr std::initializer_list topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT, - GGML_OP_VIEW, GGML_OP_GET_ROWS }; - -static constexpr std::initializer_list topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW, - GGML_OP_GET_ROWS, GGML_OP_RESHAPE, - GGML_OP_SOFT_MAX, GGML_OP_RESHAPE }; - -// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder -// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion. -static constexpr std::initializer_list snake_pattern { GGML_OP_MUL, GGML_OP_SIN, - GGML_OP_SQR, GGML_OP_MUL, - GGML_OP_ADD }; - -//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ] -//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] -//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] -//node #981 ( VIEW): ffn_moe_topk-15 ( 0K) [Vulka ] use=4: ffn_moe_argsort-15 ( 0K) [Vulka ] -//node #982 ( GET_ROWS): ffn_moe_weights-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 (re ( 0K) [Vulka ] ffn_moe_topk-15 ( 0K) [Vulka ] -//node #983 ( RESHAPE): ffn_moe_weights-15 ( ( 0K) [Vulka ] use=2: ffn_moe_weights-15 ( 0K) [Vulka ] -//node #984 ( SUM_ROWS): ffn_moe_weights_sum- ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] -//node #985 ( CLAMP): ffn_moe_weights_sum_ ( 0K) [Vulka ] use=1: ffn_moe_weights_sum- ( 0K) [Vulka ] -//node #986 ( DIV): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] ffn_moe_weights_sum_ ( 0K) [Vulka ] -//node #987 ( RESHAPE): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights_norm ( 0K) [Vulka ] -static constexpr std::initializer_list> topk_moe_early_softmax_norm_edges { - { 1, 0, 0 }, // reshape->src[0] == softmax - { 2, 0, 0 }, // argsort->src[0] == softmax - { 3, 0, 2 }, // view->src[0] == argsort - { 4, 0, 1 }, // get_rows->src[0] == reshape - { 4, 1, 3 }, // get_rows->src[1] == view - { 5, 0, 4 }, // reshape->src[0] == get_rows - { 6, 0, 5 }, // sum_rows->src[0] == reshape - { 7, 0, 6 }, // clamp->src[0] == sum_rows - { 8, 0, 5 }, // div->src[0] == reshape - { 8, 1, 7 }, // div->src[1] == clamp - { 9, 0, 8 }, // reshape->src[0] == div -}; - -//node #436 ( UNARY): ffn_moe_probs-10 ( 256K) [Vulka ] use=2: ffn_moe_logits-10 ( 256K) [Vulka ] -//node #437 ( RESHAPE): ffn_moe_probs-10 (re ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] -//node #438 ( ADD): ffn_moe_probs_biased ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] blk.10.exp_probs_b.b ( 0K) [Vulka ] -//node #439 ( ARGSORT): ffn_moe_argsort-10 ( 256K) [Vulka ] use=1: ffn_moe_probs_biased ( 256K) [Vulka ] -//node #440 ( VIEW): ffn_moe_topk-10 ( 255K) [Vulka ] use=3: ffn_moe_argsort-10 ( 256K) [Vulka ] -//node #441 ( GET_ROWS): ffn_moe_weights-10 ( 12K) [Vulka ] use=1: ffn_moe_probs-10 (re ( 256K) [Vulka ] ffn_moe_topk-10 ( 255K) [Vulka ] -//node #442 ( RESHAPE): ffn_moe_weights-10 ( ( 12K) [Vulka ] use=2: ffn_moe_weights-10 ( 12K) [Vulka ] -//node #443 ( SUM_ROWS): ffn_moe_weights_sum- ( 2K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] -//node #444 ( CLAMP): ffn_moe_weights_sum_ ( 2K) [Vulka ] use=1: ffn_moe_weights_sum- ( 2K) [Vulka ] -//node #445 ( DIV): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] ffn_moe_weights_sum_ ( 2K) [Vulka ] -//node #446 ( RESHAPE): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights_norm ( 12K) [Vulka ] -static constexpr std::initializer_list> topk_moe_sigmoid_norm_bias_edges { - { 1, 0, 0 }, // reshape->src[0] == sigmoid - { 2, 0, 0 }, // add->src[0] == sigmoid - { 3, 0, 2 }, // argsort->src[0] == add - { 4, 0, 3 }, // view->src[0] == argsort - { 5, 0, 1 }, // get_rows->src[0] == reshape - { 5, 1, 4 }, // get_rows->src[1] == view - { 6, 0, 5 }, // reshape->src[0] == get_rows - { 7, 0, 6 }, // sum_rows->src[0] == reshape - { 8, 0, 7 }, // clamp->src[0] == sum_rows - { 9, 0, 6 }, // div->src[0] == reshape - { 9, 1, 8 }, // div->src[1] == clamp - {10, 0, 9 }, // reshape->src[0] == div -}; - -// same as early_softmax_norm but ending after the get_rows -static constexpr std::initializer_list> topk_moe_early_softmax_edges { - { 1, 0, 0 }, // reshape->src[0] == softmax - { 2, 0, 0 }, // argsort->src[0] == softmax - { 3, 0, 2 }, // view->src[0] == argsort - { 4, 0, 1 }, // get_rows->src[0] == reshape - { 4, 1, 3 }, // get_rows->src[1] == view -}; - -//node #652 ( ARGSORT): ffn_moe_argsort-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 ( 0K) [Vulka ] -//node #653 ( VIEW): ffn_moe_topk-11 ( 0K) [Vulka ] use=7: ffn_moe_argsort-11 ( 0K) [Vulka ] -//node #654 ( GET_ROWS): ffn_moe_weights-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 (re ( 0K) [Vulka ] ffn_moe_topk-11 ( 0K) [Vulka ] -//node #655 ( RESHAPE): ffn_moe_weights-11 ( ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( 0K) [Vulka ] -//node #656 ( SOFT_MAX): node_656 ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( ( 0K) [Vulka ] -//node #657 ( RESHAPE): ffn_moe_weights_soft ( 0K) [Vulka ] use=1: node_656 ( 0K) [Vulka ] -static constexpr std::initializer_list> topk_moe_late_softmax_edges { - { 1, 0, 0 }, // view->src[0] == argsort - { 2, 1, 1 }, // get_rows->src[1] == view - { 3, 0, 2 }, // reshape->src[0] == get_rows - { 4, 0, 3 }, // soft_max->src[0] == reshape - { 5, 0, 4 }, // reshape->src[0] == soft_max -}; - -enum topk_moe_mode { - TOPK_MOE_EARLY_SOFTMAX, - TOPK_MOE_EARLY_SOFTMAX_NORM, - TOPK_MOE_LATE_SOFTMAX, - TOPK_MOE_SIGMOID_NORM_BIAS, - TOPK_MOE_COUNT, -}; - -static constexpr std::initializer_list> rope_view_set_rows_edges { - { 1, 0, 0 }, // view->src[0] == rope - { 2, 0, 1 }, // set_rows->src[0] == view -}; - -static constexpr std::initializer_list> rms_norm_mul_rope_view_set_rows_edges { - { 1, 0, 0 }, // mul->src[0] == rms - { 2, 0, 1 }, // rope->src[0] == mul - { 3, 0, 2 }, // view->src[0] == rope - { 4, 0, 3 }, // set_rows->src[0] == view -}; - - -struct vk_device_struct { - std::recursive_mutex mutex; - mutable std::shared_mutex pinned_memory_mutex; - - // Guards compile_pending, all_pipelines, and the dynamic pipeline maps - // (flash_attn, fa_mask_opt, solve_tri, conv2d, etc). The actual compile - // runs with no lock held, so different pipelines can compile in parallel. - // Lock order is device->mutex -> compile_mutex, never the reverse. - std::mutex compile_mutex; - std::condition_variable compile_cv; - - vk::PhysicalDevice physical_device; - vk::PhysicalDeviceProperties properties; - std::string name; - uint64_t max_memory_allocation_size; - uint64_t max_buffer_size; - uint64_t suballocation_block_size; - uint64_t min_imported_host_pointer_alignment; - bool external_memory_host {}; - bool fp16; - bool bf16; - bool pipeline_robustness; - bool memory_priority; - vk::Device device; - uint32_t vendor_id; - vk::DriverId driver_id; - vk_device_architecture architecture; - vk_queue compute_queue; - vk_queue transfer_queue; - bool single_queue; - bool support_async; - bool async_use_transfer_queue; - uint32_t subgroup_size; - uint32_t subgroup_size_log2; - uint32_t shader_core_count; - bool uma; - bool prefer_host_memory; - bool float_controls_rte_fp16; - bool subgroup_basic; - bool subgroup_arithmetic; - bool subgroup_shuffle; - bool subgroup_ballot; - bool subgroup_clustered; - bool subgroup_vote; - bool multi_add; - bool shader_int64; - bool buffer_device_address; - bool vulkan_memory_model; - - bool add_rms_fusion; - uint32_t partials_binding_alignment; - - bool shader_64b_indexing; - - bool integer_dot_product; - // 0: default, 1: force mmvq, -1: disable mmvq - int32_t mmvq_mode; - - bool subgroup_size_control; - uint32_t subgroup_min_size; - uint32_t subgroup_max_size; - bool subgroup_require_full_support; - - // floor(log2(maxComputeWorkGroupInvocations)) - uint32_t max_workgroup_size_log2 {}; - - bool coopmat_support; - bool coopmat_acc_f32_support {}; - bool coopmat_acc_f16_support {}; - bool coopmat_bf16_support {}; - bool coopmat_support_16x16x16_f16acc {}; - bool coopmat_support_16x16x16_f32acc {}; - bool coopmat1_fa_support {}; - uint32_t coopmat_m; - uint32_t coopmat_n; - uint32_t coopmat_k; - - bool coopmat_int_support; - uint32_t coopmat_int_m; - uint32_t coopmat_int_n; - uint32_t coopmat_int_k; - - bool coopmat2; - bool coopmat2_bf16_support {}; - bool coopmat2_decode_vector; - - bool dot2_f16 {}; - - bool pipeline_executable_properties_support {}; - - size_t idx; - - bool mul_mat_l[GGML_TYPE_COUNT]; - bool mul_mat_m[GGML_TYPE_COUNT]; - bool mul_mat_s[GGML_TYPE_COUNT]; - bool mul_mat_id_l[GGML_TYPE_COUNT]; - bool mul_mat_id_m[GGML_TYPE_COUNT]; - bool mul_mat_id_s[GGML_TYPE_COUNT]; - - // Separate flags for the q8_1 (integer dot) mmq path, whose shader uses - // a different shared-memory layout than the float matmul shaders. - bool mul_mat_l_int[GGML_TYPE_COUNT]; - bool mul_mat_m_int[GGML_TYPE_COUNT]; - bool mul_mat_s_int[GGML_TYPE_COUNT]; - bool mul_mat_id_l_int[GGML_TYPE_COUNT]; - bool mul_mat_id_m_int[GGML_TYPE_COUNT]; - bool mul_mat_id_s_int[GGML_TYPE_COUNT]; - - vk::DescriptorSetLayout dsl; - - vk_matmul_pipeline pipeline_matmul_f32 {}; - vk_matmul_pipeline pipeline_matmul_f32_f16 {}; - vk_matmul_pipeline pipeline_matmul_bf16 {}; - vk_matmul_pipeline2 pipeline_matmul_f16; - vk_matmul_pipeline2 pipeline_matmul_f16_f32; - - vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; - vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; - vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_COUNT]; - - vk_matmul_pipeline pipeline_matmul_id_f32 {}; - vk_matmul_pipeline pipeline_matmul_id_bf16 {}; - vk_matmul_pipeline2 pipeline_matmul_id_f16; - vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; - - vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; - vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_COUNT]; - - vk_pipeline pipeline_matmul_split_k_reduce; - vk_pipeline pipeline_quantize_q8_1_x4; - - vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; - vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; - vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; - vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT]; - - vk_pipeline pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; - vk_pipeline pipeline_dequant_mul_mat_vec_id_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT]; - - vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio]; - vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; - vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; - vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; - vk_pipeline pipeline_acc_f32; - vk_pipeline pipeline_set_f32; - - // [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16][dst 0=fp32,1=fp16] - vk_pipeline pipeline_add[2][2][2]; - vk_pipeline pipeline_add_norepeat[2][2][2]; - vk_pipeline pipeline_sub[2][2][2]; - vk_pipeline pipeline_sub_norepeat[2][2][2]; - vk_pipeline pipeline_mul[2][2][2]; - vk_pipeline pipeline_mul_norepeat[2][2][2]; - vk_pipeline pipeline_div[2][2][2]; - vk_pipeline pipeline_div_norepeat[2][2][2]; - vk_pipeline pipeline_add_rms[2][2][2]; - vk_pipeline pipeline_add_rms_norepeat[2][2][2]; - - // indexed by num_additional_fused_ops == num_adds - 1 - vk_pipeline pipeline_multi_add[MAX_FUSED_ADDS]; - vk_pipeline pipeline_multi_add_rms[MAX_FUSED_ADDS]; - - vk_pipeline pipeline_add_id_f32; - - vk_pipeline pipeline_concat_i8, pipeline_concat_i16, pipeline_concat_i32, pipeline_concat_i64; - vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32; - vk_pipeline pipeline_scale_f32; - vk_pipeline pipeline_sqr_f32; - vk_pipeline pipeline_sqrt_f32; - vk_pipeline pipeline_sin_f32; - vk_pipeline pipeline_cos_f32; - vk_pipeline pipeline_log[2]; - vk_pipeline pipeline_tri[2]; - vk_pipeline pipeline_diag[2]; - vk_pipeline pipeline_clamp_f32; - vk_pipeline pipeline_pad_f32; - vk_pipeline pipeline_roll_f32; - vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32; - vk_pipeline pipeline_repeat_i16; - vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32; - vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32; - vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; - vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; - vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32; - vk_pipeline pipeline_set_rows_i32[GGML_TYPE_COUNT]; - vk_pipeline pipeline_set_rows_i64[GGML_TYPE_COUNT]; - vk_pipeline pipeline_norm_f32; - vk_pipeline pipeline_group_norm_f32; - vk_pipeline pipeline_rms_norm_f32; - vk_pipeline pipeline_rms_norm_mul_f32; - vk_pipeline pipeline_rms_norm_partials_f32; - vk_pipeline pipeline_rms_norm_mul_partials_f32; - vk_pipeline pipeline_rms_norm_mul_rope_f32_f32; - vk_pipeline pipeline_rms_norm_mul_rope_f32_f16; - vk_pipeline pipeline_rms_norm_back_f32; - vk_pipeline pipeline_l2_norm_f32; - - // [src/dst 0=fp32,1=fp16] - vk_pipeline pipeline_exp[2]; - vk_pipeline pipeline_expm1[2]; - vk_pipeline pipeline_elu[2]; - vk_pipeline pipeline_gelu[2]; - vk_pipeline pipeline_gelu_erf[2]; - vk_pipeline pipeline_gelu_quick[2]; - vk_pipeline pipeline_silu[2]; - vk_pipeline pipeline_relu[2]; - vk_pipeline pipeline_xielu[2]; - vk_pipeline pipeline_neg[2]; - vk_pipeline pipeline_tanh[2]; - vk_pipeline pipeline_sigmoid[2]; - vk_pipeline pipeline_hardsigmoid[2]; - vk_pipeline pipeline_hardswish[2]; - vk_pipeline pipeline_abs[2]; - vk_pipeline pipeline_softplus[2]; - vk_pipeline pipeline_step[2]; - vk_pipeline pipeline_round[2]; - vk_pipeline pipeline_ceil[2]; - vk_pipeline pipeline_floor[2]; - vk_pipeline pipeline_trunc[2]; - vk_pipeline pipeline_sgn[2]; - - vk_pipeline pipeline_add1_f16_f16; - vk_pipeline pipeline_add1_f16_f32; - vk_pipeline pipeline_add1_f32_f32; - - vk_pipeline pipeline_arange_f32; - - vk_pipeline pipeline_fill_f32; - vk_pipeline pipeline_fill_f16; - - vk_pipeline pipeline_geglu[2]; - vk_pipeline pipeline_reglu[2]; - vk_pipeline pipeline_swiglu[2]; - vk_pipeline pipeline_swiglu_oai[2]; - vk_pipeline pipeline_geglu_erf[2]; - vk_pipeline pipeline_geglu_quick[2]; - - vk_pipeline pipeline_leaky_relu_f32; - vk_pipeline pipeline_silu_back_f32; - vk_pipeline pipeline_diag_mask_inf_f32; - vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; - vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; - vk_pipeline pipeline_soft_max_back_f32; - - vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16; - vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16; - vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16; - - vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; - vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; - vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16, pipeline_rope_multi_f32_f16; - vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16; - vk_pipeline pipeline_argsort_f32[num_argsort_pipelines]; - vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines]; - vk_pipeline pipeline_topk_f32[num_topk_pipelines]; - vk_pipeline pipeline_sum_rows_f32; - vk_pipeline pipeline_fwht_f32[4]; - vk_pipeline pipeline_cumsum_f32; - vk_pipeline pipeline_cumsum_small_f32; - vk_pipeline pipeline_cumsum_multipass1_f32; - vk_pipeline pipeline_cumsum_multipass2_f32; - vk_pipeline pipeline_argmax_f32; - vk_pipeline pipeline_count_equal_i32; - std::map pipeline_solve_tri_f32; - vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; - vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16; - vk_pipeline pipeline_timestep_embedding_f32; - vk_pipeline pipeline_conv_transpose_1d_f32; - vk_pipeline pipeline_col2im_1d_f32; - vk_pipeline pipeline_col2im_1d_f16; - vk_pipeline pipeline_col2im_1d_bf16; - vk_pipeline pipeline_snake_f32; - vk_pipeline pipeline_snake_f16; - vk_pipeline pipeline_snake_bf16; - vk_pipeline pipeline_pool2d_f32; - vk_pipeline pipeline_rwkv_wkv6_f32; - vk_pipeline pipeline_rwkv_wkv7_f32; - // [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128 - vk_pipeline pipeline_gated_delta_net[4][2]; - vk_pipeline pipeline_ssm_scan_f32_d128; - vk_pipeline pipeline_ssm_scan_f32_d256; - vk_pipeline pipeline_ssm_conv_f32; - vk_pipeline pipeline_ssm_conv_silu_f32; - vk_pipeline pipeline_ssm_conv_bias_silu_f32; - vk_pipeline pipeline_opt_step_adamw_f32; - vk_pipeline pipeline_opt_step_sgd_f32; - std::map pipeline_conv2d_f32[CONV_SHAPE_COUNT]; - std::map pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT]; - std::map pipeline_conv_transpose_2d_f32[CONV_SHAPE_COUNT]; - std::map pipeline_conv_transpose_2d_f16_f32[CONV_SHAPE_COUNT]; - vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32; - vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32; - - std::map pipeline_flash_attn_f32_f16; - - std::map, vk_pipeline> pipeline_fa_mask_opt; - - vk_pipeline pipeline_flash_attn_split_k_reduce; - vk_pipeline pipeline_count_experts; - - // [2] is for whether to take n_experts from spec constant (0) or push constant (1) - vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2]; - - std::vector all_pipelines; - - std::vector> pinned_memory; - - vk::Fence fence; - vk_buffer sync_staging; - - ggml_backend_buffer_type buffer_type; - - bool disable_fusion; - bool disable_host_visible_vidmem; - bool allow_sysmem_fallback; - bool disable_graph_optimize; - - std::unique_ptr memory_logger; - - ~vk_device_struct() { - VK_LOG_DEBUG("destroy device " << name); - - device.destroyFence(fence); - - ggml_vk_destroy_buffer(sync_staging); - - compute_queue.cmd_pool.destroy(device); - transfer_queue.cmd_pool.destroy(device); - - for (auto& pipeline : all_pipelines) { - if (pipeline.expired()) { - continue; - } - - vk_pipeline pl = pipeline.lock(); - ggml_vk_destroy_pipeline(device, pl); - } - all_pipelines.clear(); - - device.destroyDescriptorSetLayout(dsl); - - device.destroy(); - } -}; - -void vk_command_pool::init(vk_device& device, vk_queue *q_) { - cmd_buffers.clear(); - q = q_; - - vk::CommandPoolCreateInfo command_pool_create_info( - vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT), - q->queue_family_index); - pool = device->device.createCommandPool(command_pool_create_info); -} - -void vk_command_pool::destroy(vk::Device& device) { - device.destroyCommandPool(pool); - pool = nullptr; - cmd_buffers.clear(); -} - -struct vk_buffer_struct { - vk::Buffer buffer = VK_NULL_HANDLE; - vk::DeviceMemory device_memory = VK_NULL_HANDLE; - vk::MemoryPropertyFlags memory_property_flags; - void * ptr; - size_t size = 0; - vk::DeviceAddress bda_addr {}; - - vk_device device; - - ~vk_buffer_struct() { - if (size == 0) { - return; - } - VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")"); - - device->device.freeMemory(device_memory); - device->device.destroyBuffer(buffer); - } -}; - -struct vk_subbuffer { - vk_buffer buffer; - uint64_t offset; - uint64_t size; - - operator vk::DescriptorBufferInfo() const { - return { buffer->buffer, offset, size }; - } -}; - -struct vk_semaphore { - vk::Semaphore s; - uint64_t value; -}; - -// vk_event is used for the event-related backend interfaces. It uses vk::Events for -// event_wait and a timeline semaphore for event_synchronize. Polling on an event for -// event_synchronize wouldn't be sufficient to wait for command buffers to complete, -// and would lead to validation errors. -struct vk_event { - std::vector events_free; // Events available for reuse - std::vector events_submitted; // Events that are fully submitted and can be reused on next synchronize - vk::Event event; - bool has_event; - - vk_semaphore tl_semaphore; - vk_command_buffer* cmd_buffer = nullptr; - uint64_t cmd_buffer_use_counter = 0; -}; - -struct vk_submission { - vk_command_buffer* buffer = nullptr; - std::vector wait_semaphores; - std::vector signal_semaphores; -}; - -typedef std::vector vk_sequence; - -struct vk_mat_mat_push_constants { - uint32_t M; uint32_t N; uint32_t K; - uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; - uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; - uint32_t base_work_group_z; uint32_t num_batches; - uint32_t k_split; - uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; - uint32_t padded_N; -}; - -#define MAT_VEC_FUSION_FLAGS_BIAS0 0x1 -#define MAT_VEC_FUSION_FLAGS_BIAS1 0x2 -#define MAT_VEC_FUSION_FLAGS_SCALE0 0x4 -#define MAT_VEC_FUSION_FLAGS_SCALE1 0x8 - -struct vk_mat_vec_push_constants { - uint32_t ncols; - uint32_t stride_a; - uint32_t stride_b; - uint32_t stride_d; - uint32_t batch_stride_a; - uint32_t batch_stride_b; - uint32_t batch_stride_d; - uint32_t fusion_flags; - uint32_t base_work_group_y; - uint32_t ne02; - uint32_t ne12; - uint32_t broadcast2; - uint32_t broadcast3; -}; - -struct vk_mat_vec_p021_push_constants { - uint32_t ncols_x; - uint32_t nrows_x; - uint32_t nchannels_x; - uint32_t nchannels_y; - uint32_t b_offset; - uint32_t d_offset; - uint32_t fusion_flags; -}; - -struct vk_mat_vec_nc_push_constants { - uint32_t ncols_x; - uint32_t nrows_x; - uint32_t row_stride_x; - uint32_t channel_stride_x; - uint32_t channel_stride_y; - uint32_t channel_x_divisor; - uint32_t ne12; - uint32_t b_offset; - uint32_t d_offset; - uint32_t nb03; - uint32_t nb13; - uint32_t nb23; - uint32_t fusion_flags; -}; - -struct vk_mat_mat_id_push_constants { - uint32_t M; uint32_t N; uint32_t K; - uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; - uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; - uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11; - uint32_t padded_N; -}; -struct vk_mat_vec_id_push_constants { - uint32_t ncols; - uint32_t stride_a; - uint32_t stride_b; - uint32_t stride_d; - uint32_t batch_stride_a; - uint32_t batch_stride_b; - uint32_t batch_stride_d; - uint32_t fusion_flags; - uint32_t nei0; - uint32_t ne11; - uint32_t expert_i1; - uint32_t nbi1; -}; - -struct vk_flash_attn_push_constants { - uint32_t N; - uint32_t KV; - - uint32_t ne1; - uint32_t ne2; - uint32_t ne3; - - uint32_t neq2; - uint32_t neq3; - uint32_t nek2; - uint32_t nek3; - uint32_t nev2; - uint32_t nev3; - uint32_t nem1; - uint32_t nem2; - uint32_t nem3; - - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t nb21; - uint32_t nb22; - uint32_t nb23; - - float scale; - float max_bias; - float logit_softcap; - - uint32_t mask_n_head_log2; - float m0; - float m1; - - uint32_t gqa_ratio; - uint32_t split_kv; - uint32_t k_num; -}; -static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128"); - -struct vk_op_push_constants { - uint32_t KX; - uint32_t KY; - float param1; - float param2; - float param3; - float param4; -}; - -struct vk_op_fwht_push_constants { - uint32_t n_rows; - uint32_t src_offset; - uint32_t dst_offset; - float scale; -}; - -struct vk_op_count_experts_push_constants { - uint32_t ne00; - uint32_t ne01; - uint32_t nb00; - uint32_t nb01; - uint32_t a_offset; -}; - -struct vk_op_glu_push_constants { - uint32_t N; - uint32_t ne00; - uint32_t ne20; - uint32_t mode; // 0: default, 1: swapped, 2: split - float alpha; // for swiglu_oai - float limit; - uint32_t nb00; - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - uint32_t nb10; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t nb20; - uint32_t nb21; - uint32_t nb22; - uint32_t nb23; - uint32_t ne21; - uint32_t ne22; - uint32_t misalign_offsets; - uint32_t ne2_012mp; uint32_t ne2_012L; - uint32_t ne2_01mp; uint32_t ne2_01L; - uint32_t ne2_0mp; uint32_t ne2_0L; -}; -static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128"); - -struct vk_op_unary_push_constants { - uint32_t ne; - uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; - uint32_t misalign_offsets; - float param1; float param2; float param3; float param4; - uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls; - uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls; -}; -static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); - -static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) { - GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst))); - ne = ne != 0 ? ne : ggml_nelements(dst); - GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); - - vk_op_unary_push_constants p{}; - p.ne = (uint32_t)ne; - - size_t src0_tsize = ggml_type_size(src0->type); - p.ne00 = (uint32_t)src0->ne[0]; - p.ne01 = (uint32_t)src0->ne[1]; - p.ne02 = (uint32_t)src0->ne[2]; - p.ne03 = (uint32_t)src0->ne[3]; - p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); - p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); - p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); - p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); - - size_t dst_tsize = ggml_type_size(dst->type); - p.ne10 = (uint32_t)dst->ne[0]; - p.ne11 = (uint32_t)dst->ne[1]; - p.ne12 = (uint32_t)dst->ne[2]; - p.ne13 = (uint32_t)dst->ne[3]; - p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); - p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); - p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); - p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); - - return p; // offsets are initialized later in ggml_vk_op -} - -struct vk_op_pad_push_constants { - uint32_t ne; - uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; - uint32_t misalign_offsets; - uint32_t circular; - - uint32_t lp0; uint32_t rp0; - uint32_t lp1; uint32_t rp1; - uint32_t lp2; uint32_t rp2; - uint32_t lp3; uint32_t rp3; -}; - -static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst) { - int64_t ne = ggml_nelements(dst); - GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); - - vk_op_pad_push_constants p{}; - p.ne = (uint32_t)ne; - - size_t src0_tsize = ggml_type_size(src0->type); - p.ne00 = (uint32_t)src0->ne[0]; - p.ne01 = (uint32_t)src0->ne[1]; - p.ne02 = (uint32_t)src0->ne[2]; - p.ne03 = (uint32_t)src0->ne[3]; - p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); - p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); - p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); - p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); - - size_t dst_tsize = ggml_type_size(dst->type); - p.ne10 = (uint32_t)dst->ne[0]; - p.ne11 = (uint32_t)dst->ne[1]; - p.ne12 = (uint32_t)dst->ne[2]; - p.ne13 = (uint32_t)dst->ne[3]; - p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); - p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); - p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); - p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); - - p.lp0 = dst->op_params[0]; - p.rp0 = dst->op_params[1]; - p.lp1 = dst->op_params[2]; - p.rp1 = dst->op_params[3]; - p.lp2 = dst->op_params[4]; - p.rp2 = dst->op_params[5]; - p.lp3 = dst->op_params[6]; - p.rp3 = dst->op_params[7]; - p.circular = dst->op_params[8]; - - return p; // fastdiv values and offsets are initialized later in ggml_vk_op -} - -// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. -// Precompute mp (m' in the paper) and L such that division -// can be computed using a multiply (high 32b of 64b result) -// and a shift: -// -// n/d = (mulhi(n, mp) + n) >> L; -static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) -{ - // compute L = ceil(log2(d)); - L = 0; - while (L < 32 && (uint32_t{1} << L) < d) { - L++; - } - - mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); -} - -static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) { - return L0 | (L1 << 8) | (L2 << 16); -} - -template void init_pushconst_fastdiv(T &p) { - GGML_UNUSED(p); - static_assert(!std::is_const::value, "unexpected type"); -} - -template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { - // Compute magic values to divide by these six numbers. - uint32_t ne0_012L; - uint32_t ne0_01L; - uint32_t ne0_0L; - uint32_t ne1_012L; - uint32_t ne1_01L; - uint32_t ne1_0L; - - init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L); - init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L); - init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L); - init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L); - init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L); - init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L); - - p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L); - p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L); -} - -template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) { - // GLU linearizes over dst, then uses dst coordinates for src0/src1. - init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L); - init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L); - init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L); -} - -struct vk_op_binary_push_constants { - uint32_t ne; - uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; - uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; - uint32_t misalign_offsets; - float param1; float param2; int32_t param3; -}; - -struct vk_op_multi_add_push_constants { - // shape for dst - uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; - - // strides for srcs+dst - uint32_t nb[MAX_PARAMETER_COUNT][4]; - - uint32_t rms_partials; -}; -// update multi_add.comp if this changes -static_assert(MAX_PARAMETER_COUNT == 12); -static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); - -struct vk_op_topk_moe_push_constants { - uint32_t n_rows; - uint32_t n_experts_push; - uint32_t n_expert_used; - float clamp_min; - float clamp_max; - uint32_t gating_func; - uint32_t has_bias; - uint32_t with_norm; - float output_scale; - float output_bias; -}; - -struct vk_op_add_id_push_constants { - uint32_t ne0; - uint32_t ne1; - uint32_t s01; - uint32_t s02; - uint32_t s11; - uint32_t s21; -}; - -struct vk_op_diag_mask_push_constants { - uint32_t ncols; - uint32_t rows_per_channel; - int32_t n_past; -}; - -struct vk_op_rope_push_constants { - uint32_t rope_mode; - uint32_t nrows; - uint32_t n_dims; - float freq_scale; - float freq_base; - float ext_factor; - float attn_factor; - float corr_dims[2]; - float theta_scale; - uint32_t has_ff; - int32_t sections[4]; - uint32_t is_imrope; - uint32_t is_back; - uint32_t set_rows_stride; - uint32_t ne00; - uint32_t ne01; - uint32_t ne02; - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t a_offset; - uint32_t d_offset; -}; -static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128"); - -// For fused rms_norm+mul+rope(+view+set_rows) -struct vk_op_rms_norm_mul_rope_push_constants { - vk_op_binary_push_constants bin; - vk_op_rope_push_constants rope; -}; - -struct vk_op_soft_max_push_constants { - uint32_t KX; - uint32_t KY; - uint32_t ne00; - uint32_t ne01; - uint32_t ne02; - uint32_t ne12; - uint32_t ne13; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - float scale; - float max_bias; - float m0; - float m1; - uint32_t n_head_log2; - uint32_t nrows_x; - uint32_t has_sinks; -}; - -struct vk_op_argsort_push_constants { - uint32_t ncols; - uint32_t ncols_padded; - uint32_t ncols_padded_log2; - uint32_t nrows; - uint32_t order; - uint32_t outer_start; - uint32_t outer_end; - uint32_t inner_start; - uint32_t inner_end; -}; - -struct vk_op_topk_push_constants { - uint32_t orig_ncols; - uint32_t ncols_input; - uint32_t ncols_output; - uint32_t k; - uint32_t nrows; - uint32_t first_pass; - uint32_t last_pass; -}; - -struct vk_op_im2col_push_constants { - uint64_t dst_addr; - uint32_t batch_offset; uint32_t offset_delta; - uint32_t IC; - uint32_t IW; uint32_t IH; - uint32_t OW; uint32_t OH; - uint32_t KW; uint32_t KH; - uint32_t OH_batch; - uint32_t CHW; - int32_t s0; int32_t s1; - int32_t p0; int32_t p1; - int32_t d0; int32_t d1; - uint32_t batch_IC; -}; - -struct vk_op_im2col_3d_push_constants { - uint64_t dst_addr; - uint32_t nb10; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t s0; - uint32_t s1; - uint32_t s2; - uint32_t p0; - uint32_t p1; - uint32_t p2; - uint32_t d0; - uint32_t d1; - uint32_t d2; - uint32_t IW; - uint32_t IH; - uint32_t ID; - uint32_t IC; - uint32_t KW; - uint32_t OH; - uint32_t KD_KH_KW; - uint32_t KH_KW; - uint32_t IC_KD_KH_KW; - uint32_t N_OD_OH; - uint32_t OD_OH; - uint32_t OD_OH_OW_IC_KD_KH_KW; - uint32_t OH_OW_IC_KD_KH_KW; - uint32_t OW_IC_KD_KH_KW; - uint32_t misalign_offsets; -}; - -struct vk_op_timestep_embedding_push_constants { - uint32_t nb1; - uint32_t dim; - uint32_t max_period; -}; - -struct vk_op_col2im_1d_push_constants { - uint32_t T_out; - uint32_t OC; - uint32_t K_OC; - uint32_t T_in; - uint32_t K; - int32_t stride; - int32_t p0; -}; - -struct vk_op_conv_transpose_1d_push_constants { - uint32_t Cout; - uint32_t Cin; - uint32_t K; - uint32_t L; - uint32_t KL; - - uint32_t nb01; - uint32_t nb02; - uint32_t nb11; - uint32_t nb1; - - int32_t s0; -}; - -struct vk_op_snake_push_constants { - uint32_t ne0; - uint32_t ne1; -}; - -struct vk_op_pool2d_push_constants { - uint32_t IW; uint32_t IH; - uint32_t OW; uint32_t OH; - uint32_t OC; - uint32_t pelements; - uint32_t op; - int32_t k0; int32_t k1; - int32_t s0; int32_t s1; - int32_t p0; int32_t p1; -}; - -struct vk_op_rwkv_wkv6_push_constants { - uint32_t B; - uint32_t T; - uint32_t C; - uint32_t H; -}; - -struct vk_op_rwkv_wkv7_push_constants { - uint32_t B; - uint32_t T; - uint32_t C; - uint32_t H; -}; -struct vk_op_gated_delta_net_push_constants { - uint32_t H; - uint32_t n_tokens; - uint32_t n_seqs; - uint32_t s_off; - uint32_t sq1, sq2, sq3; - uint32_t sv1, sv2, sv3; - uint32_t sb1, sb2, sb3; - uint32_t neq1, rq3; - float scale; - uint32_t K; -}; - -struct vk_op_ssm_scan_push_constants { - uint32_t nb02, nb03, nb12, nb13; - uint32_t nb21, nb22, nb31; - uint32_t nb42, nb43, nb52, nb53; - uint32_t s_off; - uint32_t n_head, d_head, n_group, n_tok; -}; -struct vk_op_ssm_conv_push_constants { - uint32_t nb01, nb02; - uint32_t nb11; - uint32_t dst_nb0, dst_nb1, dst_nb2; - uint32_t nc, ncs, nr, n_t, n_s; -}; - -struct vk_op_conv2d_push_constants { - uint32_t Cout; - uint32_t Cin; - uint32_t N; - - uint32_t W; - uint32_t H; - uint32_t OW; - uint32_t OH; - - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - - uint32_t nb1; - uint32_t nb2; - uint32_t nb3; - - // init_fastdiv_values constants for dividing by OW, OW*OH - uint32_t OWmp; uint32_t OWL; - uint32_t OWOHmp; uint32_t OWOHL; -}; - -template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { - // Compute magic values to divide by OW, OW*OH - init_fastdiv_values(p.OW, p.OWmp, p.OWL); - init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL); -} - -struct vk_op_conv2d_dw_push_constants { - uint32_t ne; - uint32_t batches; - uint32_t channels; - uint32_t dst_w; - uint32_t dst_h; - uint32_t src_w; - uint32_t src_h; - uint32_t knl_w; - uint32_t knl_h; - int32_t stride_x; - int32_t stride_y; - int32_t pad_x; - int32_t pad_y; - int32_t dilation_x; - int32_t dilation_y; -}; - -struct vk_op_upscale_push_constants { - uint32_t ne; uint32_t a_offset; uint32_t d_offset; - uint32_t ne00; uint32_t ne01; - uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; - float sf0; float sf1; float sf2; float sf3; - float pixel_offset; -}; - -struct vk_op_sum_rows_push_constants -{ - uint32_t n_cols; - uint32_t ne01, ne02; - uint32_t nb01, nb02, nb03; - uint32_t nb11, nb12, nb13; - float weight; - uint32_t misalign_offsets; - uint32_t ne0_12mp, ne0_12L; - uint32_t ne0_1mp, ne0_1L; -}; - -static vk_op_sum_rows_push_constants vk_op_sum_rows_push_constants_init(const ggml_tensor * src, const ggml_tensor * dst, int64_t n_cols) { - uint32_t type_size = (uint32_t)ggml_type_size(src->type); - vk_op_sum_rows_push_constants p = {}; - p.n_cols = (uint32_t)n_cols; - p.ne01 = (uint32_t)src->ne[1]; - p.ne02 = (uint32_t)src->ne[2]; - p.nb01 = (uint32_t)src->nb[1] / type_size; - p.nb02 = (uint32_t)src->nb[2] / type_size; - p.nb03 = (uint32_t)src->nb[3] / type_size; - p.nb11 = (uint32_t)dst->nb[1] / type_size; - p.nb12 = (uint32_t)dst->nb[2] / type_size; - p.nb13 = (uint32_t)dst->nb[3] / type_size; - p.weight = 1.0f; - return p; -} - -template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) { - init_fastdiv_values(p.ne01*p.ne02, p.ne0_12mp, p.ne0_12L); - init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L); -} - -struct vk_quantize_q8_1_push_constants { - uint32_t ne; - uint32_t num_blocks; -}; - -struct vk_op_flash_attn_split_k_reduce_push_constants { - uint32_t D; - uint32_t ne1; - uint32_t ne2; - uint32_t ne3; - uint32_t k_num; - uint32_t sinks; -}; - -struct vk_op_flash_attn_mask_opt_push_constants { - uint32_t nem0; - uint32_t nem1; - uint32_t nem2; - uint32_t nbm1; - uint32_t nbm2; - uint32_t nbm3; - uint32_t nbd1; - uint32_t nbd2; - uint32_t nbd3; -}; - -// Allow pre-recording command buffers -struct vk_staging_memcpy { - vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} - - void * dst; - const void * src; - size_t n; -}; - -struct vk_staging_memset { - vk_staging_memset(void * _dst, uint32_t _val, size_t _n) : dst(_dst), val(_val), n(_n) {} - - void * dst; - uint32_t val; - size_t n; -}; - -struct vk_context_struct { - vk_submission * s; - std::vector seqs; - - int exit_tensor_idx; - - std::vector in_memcpys; - std::vector out_memcpys; - std::vector memsets; - - vk_command_pool * p {}; -}; -typedef std::shared_ptr vk_context; -typedef std::weak_ptr vk_context_ref; - -struct ggml_vk_garbage_collector { - std::vector tl_semaphores; - std::vector semaphores; - std::vector events; - std::vector contexts; -}; - -static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx); -static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr); -static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx); - -static bool vk_memory_logger_enabled = false; - -#define VK_LOG_MEMORY(msg) if (vk_memory_logger_enabled) { std::cerr << "ggml_vulkan memory: " << msg << std::endl; } - -static std::string format_size(size_t size) { - const size_t kib = 1024; - const size_t mib = kib * 1024; - const size_t gib = mib * 1024; - - std::ostringstream oss; - oss << std::fixed << std::setprecision(2); - - if (size >= gib) { - oss << static_cast(size) / gib << " GiB"; - } else if (size >= mib) { - oss << static_cast(size) / mib << " MiB"; - } else if (size >= kib) { - oss << static_cast(size) / kib << " KiB"; - } else { - oss << size << " B"; - } - - return oss.str(); -} - -class vk_memory_logger { -public: - vk_memory_logger(): total_device(0), total_host(0) {} - void log_allocation(vk_buffer_ref buf_ref, size_t size); - void log_deallocation(vk_buffer_ref buf_ref); - -private: - std::map allocations; // Track allocations - size_t total_device; - size_t total_host; - static std::mutex log_mutex; -}; - -std::mutex vk_memory_logger::log_mutex; - -static bool vk_perf_logger_enabled = false; -static bool vk_perf_logger_concurrent = false; -static bool vk_enable_sync_logger = false; -// number of calls between perf logger prints -static uint32_t vk_perf_logger_frequency = 1; -static std::string vk_pipeline_stats_filter; - -class vk_perf_logger { - public: - void print_timings(bool force = false) { - if (timings.empty()) { - return; - } - print_count++; - if ((print_count % vk_perf_logger_frequency) != 0 && !force) { - return; - } - print_count = 0; - uint64_t total_all_op_times = 0; - std::cerr << "----------------\nVulkan Timings:" << std::endl; - for (const auto & t : timings) { - uint64_t total_op_times = 0; - for (const auto & time : t.second) { - total_op_times += time; - } - std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) - << " us = " << (total_op_times / 1000.0) << " us"; - - // If we have as many flops entries as timing entries for the op, then compute and log the flops/S. - auto it = flops.find(t.first); - if (it != flops.end() && (it->second).size() == t.second.size()) { - uint64_t total_op_flops = 0; - for (const auto & elem : it->second) { - total_op_flops += elem; - } - std::cerr << " (" - << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) / - (double(total_op_times) / (1000.0 * 1000.0 * 1000.0)) - << " GFLOPS/s)"; - } - - total_all_op_times += total_op_times; - - std::cerr << std::endl; - } - - if (timings.size() > 0) { - std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl; - } - - timings.clear(); - flops.clear(); - } - - std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) { - *n_flops = 0; - std::string fusion_str; - if (fusion_name) { - fusion_str = fusion_name + std::string(" "); - } - if (node->op == GGML_OP_UNARY) { - return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node)); - } - if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { - const uint64_t m = node->ne[0]; - const uint64_t n = node->ne[1]; - const uint64_t k = node->src[1]->ne[0]; - const uint64_t batch = node->ne[2] * node->ne[3]; - std::string name = ggml_op_name(node->op); - if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) || - (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) { - name += "_VEC"; - } - name += " "; - name += ggml_type_name(node->src[0]->type); - name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); - if (node->op == GGML_OP_MUL_MAT_ID) { - name += " n_expert=" + std::to_string(node->src[0]->ne[2]); - } - if (batch > 1) { - name += " batch=" + std::to_string(batch); - } - name = fusion_str + name; - *n_flops = m * n * (k + (k - 1)) * batch; - return name; - } - if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) { - std::string name = ggml_op_name(node->op); - ggml_tensor * knl = node->src[0]; - uint64_t OW = node->ne[0]; - uint64_t OH = node->ne[1]; - uint64_t N = node->ne[3]; - uint64_t Cout = node->ne[2]; - uint64_t KW = knl->ne[0]; - uint64_t KH = knl->ne[1]; - uint64_t Cin = node->src[1]->ne[2]; - // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ - uint64_t size_M = Cout; - uint64_t size_K = Cin * KW * KH; - uint64_t size_N = N * OW * OH; - *n_flops = size_M * size_N * (size_K + (size_K - 1)); - name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + - ", N=N*OW*OH=" + std::to_string(size_N); - name = fusion_str + name; - return name; - } - if (node->op == GGML_OP_RMS_NORM) { - std::string name = ggml_op_name(node->op); - name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; - name = fusion_str + name; - return name; - } - if (node->op == GGML_OP_FLASH_ATTN_EXT) { - const ggml_tensor * dst = node; - const ggml_tensor * q = node->src[0]; - const ggml_tensor * k = node->src[1]; - const ggml_tensor * v = node->src[2]; - const ggml_tensor * m = node->src[3]; - std::stringstream name; - name << fusion_str; - name << ggml_op_name(node->op) << - " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " << - " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " << - " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " << - " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " << - " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")"; - *n_flops = 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3]; - return name.str(); - } - if (node->op == GGML_OP_TOP_K) { - std::stringstream name; - name << fusion_str; - name << ggml_op_name(node->op) << - " K=" << node->ne[0] << - " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")"; - return name.str(); - } - return fusion_str + ggml_op_name(node->op); - } - - void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { - uint64_t n_flops; - std::string name = get_node_fusion_name(node, fusion_name, &n_flops); - if (n_flops) { - flops[name].push_back(n_flops); - } - timings[name].push_back(time); - } - - void log_timing(const std::vector &nodes, const std::vector &names, uint64_t time) { - uint64_t total_flops = 0; - std::string name; - for (size_t n = 0; n < nodes.size(); ++n) { - uint64_t n_flops = 0; - name += get_node_fusion_name(nodes[n], names[n], &n_flops); - total_flops += n_flops; - - if (n != nodes.size() - 1) { - name += ", "; - } - } - if (total_flops) { - flops[name].push_back(total_flops); - } - timings[name].push_back(time); - } - - private: - std::map> timings; - std::map> flops; - uint32_t print_count {}; -}; - -struct ggml_backend_vk_context { - std::string name; - - vk_device device; - - size_t semaphore_idx, event_idx; - ggml_vk_garbage_collector gc; - size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset; - vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials, sync_staging; - vk::Fence fence, almost_ready_fence; - bool submit_pending {}; - bool almost_ready_fence_pending {}; - // Set before op_add and unset after op_rms_norm to indicate that the add should - // write partial sums to accumulate the square of the vector components - bool do_add_rms_partials_offset_calculation; - bool do_add_rms_partials; - - uint64_t last_total_mul_mat_bytes {}; - - // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. - vk_pipeline_struct * prealloc_y_last_pipeline_used {}; - const ggml_tensor * prealloc_y_last_tensor_used {}; - // True when prealloc_y holds the padded fp16 layout used by the coopmat2 B decode-vector callback. - // If false, then it's contiguous. - bool prealloc_y_last_decode_vector_staging {}; - - // Track which nodes have been used since the last sync, and whether they were written to - std::vector unsynced_nodes_written; - std::vector unsynced_nodes_read; - // Track which prealloc buffers have pending reads that need to be synchronized. - // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set), - // and set to true after the buffer contents are consumed. - bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync; - - vk_context_ref compute_ctx; - - vk_context_ref transfer_ctx; - vk_semaphore transfer_semaphore; - uint64_t transfer_semaphore_last_submitted {}; - - std::vector tensor_ctxs; - - std::vector descriptor_pools; - std::vector descriptor_sets; - uint32_t descriptor_set_idx {}; - uint32_t pipeline_descriptor_set_requirements {}; - - vk_command_pool compute_cmd_pool; - vk_command_pool transfer_cmd_pool; - - // number of additional consecutive nodes that are being fused with the - // node currently being processed - int num_additional_fused_ops {}; - // Bitmask of which fused ops need to write an intermediate value to memory. - // Bit 'i' means nodes[start_of_fusion + i] writes to memory. - // If there's no fusion, bit 0 is still set. - int fused_ops_write_mask {}; - topk_moe_mode fused_topk_moe_mode {}; - bool fused_topk_moe_scale {}; - - // for GGML_VK_PERF_LOGGER - std::unique_ptr perf_logger; - vk::QueryPool query_pool; - std::vector query_fusion_names; - std::vector query_fusion_node_count; - std::vector query_nodes; - std::vector query_node_idx; - int32_t num_queries {}; - int32_t query_idx {}; -}; - -static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT - -static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; -} - -static uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t) -{ - return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));; -} - -template void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - GGML_UNUSED(p); - GGML_UNUSED(src0); - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); - GGML_UNUSED(dst); - static_assert(!std::is_const::value, "unexpected type"); - GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0); - GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0); - GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0); - GGML_ASSERT(!src3 || get_misalign_bytes(ctx, src3) == 0); - GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_p021_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.b_offset = b_offset; - p.d_offset = d_offset; - - GGML_UNUSED(src0); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_nc_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.b_offset = b_offset; - p.d_offset = d_offset; - - GGML_UNUSED(src0); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - p.dst_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -struct ggml_backend_vk_buffer_context { - vk_device_ref device; - vk_buffer dev_buffer; - std::string name; - - ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : - device(device), - dev_buffer(dev_buffer), - name(name) { - } - - ~ggml_backend_vk_buffer_context() { - ggml_vk_destroy_buffer(dev_buffer); - } -}; - -void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { - if (!vk_memory_logger_enabled) { - return; - } - std::lock_guard guard(log_mutex); - vk_buffer buf = buf_ref.lock(); - const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); - const std::string type = device ? "device" : "host"; - allocations[buf->buffer] = size; - total_device += device ? size : 0; - total_host += device ? 0 : size; - VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); -} - -void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { - if (buf_ref.expired() || buf_ref.lock()->size == 0 || !vk_memory_logger_enabled) { - return; - } - - std::lock_guard guard(log_mutex); - vk_buffer buf = buf_ref.lock(); - const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); - std::string type = device ? "device" : "host"; - auto it = allocations.find(buf->buffer); - if (it != allocations.end()) { - total_device -= device ? it->second : 0; - total_host -= device ? 0 : it->second; - VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); - allocations.erase(it); - } else { - VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer); - } -} - -struct vk_instance_t { - vk::Instance instance; - - bool debug_utils_support = false; // VK_EXT_debug_utils enabled - PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; - PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; - PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; - PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; - PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; - PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; - - std::vector device_indices; - std::vector device_supports_membudget; - vk_device devices[GGML_VK_MAX_DEVICES]; -}; - -static bool vk_instance_initialized = false; -static vk_instance_t vk_instance; - -#ifdef GGML_VULKAN_CHECK_RESULTS -static size_t vk_skip_checks; -static size_t vk_output_tensor; - -static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); -#endif - -typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -static void ggml_backend_vk_free(ggml_backend_t backend); - -static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { - const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, - VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange}); - return range; -} - -// Wait for ctx->fence to be signaled. -static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) { - // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep - // during this wait. - if (ctx->almost_ready_fence_pending) { - VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence"); - ctx->device->device.resetFences({ ctx->almost_ready_fence }); - ctx->almost_ready_fence_pending = false; - } - - // Spin (w/pause) waiting for the graph to finish executing. - vk::Result result; - while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) { - if (result != vk::Result::eNotReady) { - fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__); - exit(1); - } - for (uint32_t i = 0; i < 100; ++i) { - YIELD(); - YIELD(); - YIELD(); - YIELD(); - YIELD(); - YIELD(); - YIELD(); - YIELD(); - YIELD(); - YIELD(); - } - } - ctx->device->device.resetFences({ ctx->fence }); -} - -static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367; -static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447; -static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4; - -// Remove SPV_NV_cooperative_matrix_decode_vector usage from a SPIR-V module so it -// can be loaded on drivers that only support SPV_NV_cooperative_matrix2. Drops the -// OpExtension declaration, the CooperativeMatrixDecodeVectorNV OpCapability, and the -// DecodeVectorFunc operand from any OpCooperativeMatrixLoadTensorNV instruction. -// Returns true when the input used the extension (and `out` was populated with a -// stripped copy); returns false otherwise without touching `out`. -static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count, std::vector & out) { - static const char kDecodeVectorExt[] = "SPV_NV_cooperative_matrix_decode_vector"; - - if (word_count < 5) { - return false; - } - - bool uses_decode_vector = false; - for (size_t pos = 5; pos < word_count; ) { - uint32_t word = code[pos]; - uint32_t wc = word >> spv::WordCountShift; - uint32_t op = word & spv::OpCodeMask; - GGML_ASSERT(wc > 0 && pos + wc <= word_count); - if (op == spv::OpExtension && wc >= 2) { - const char * s = reinterpret_cast(&code[pos + 1]); - if (strcmp(s, kDecodeVectorExt) == 0) { - uses_decode_vector = true; - break; - } - } - pos += wc; - } - - if (!uses_decode_vector) { - return false; - } - - VK_LOG_DEBUG("ggml_vk_strip_decode_vector: stripping SPV_NV_cooperative_matrix_decode_vector"); - - // Bulk-copy unchanged runs and only break the run when an instruction needs to - // be dropped or patched. Use reserve + insert/push_back so the destination buffer - // is touched exactly once (no zero-initialization pass from resize()). - out.clear(); - out.reserve(word_count); - - size_t run_start = 0; - auto flush_run = [&](size_t up_to) { - if (up_to > run_start) { - out.insert(out.end(), code + run_start, code + up_to); - } - }; - - for (size_t pos = 5; pos < word_count; ) { - uint32_t word = code[pos]; - uint32_t wc = word >> spv::WordCountShift; - uint32_t op = word & spv::OpCodeMask; - GGML_ASSERT(wc > 0 && pos + wc <= word_count); - - if (op == spv::OpExtension && wc >= 2) { - const char * s = reinterpret_cast(&code[pos + 1]); - if (strcmp(s, kDecodeVectorExt) == 0) { - flush_run(pos); - pos += wc; - run_start = pos; - continue; - } - } - - if (op == spv::OpCapability && wc == 2 && code[pos + 1] == kSpvCapabilityCooperativeMatrixDecodeVectorNV) { - flush_run(pos); - pos += wc; - run_start = pos; - continue; - } - - if (op == kSpvOpCooperativeMatrixLoadTensorNV) { - // [opcode/wc][ResultType][Result][Pointer][Object][TensorLayout][MemOperand mask][mem extras...][TA mask][ta extras...] - GGML_ASSERT(wc >= 8); - - uint32_t mem_mask = code[pos + 6]; - size_t cur = pos + 7; - // Each of these MemoryAccess bits (when set) carries one trailing operand. - cur += (mem_mask & 0x2) ? 1 : 0; // Aligned - cur += (mem_mask & 0x8) ? 1 : 0; // MakePointerAvailable - cur += (mem_mask & 0x10) ? 1 : 0; // MakePointerVisible - cur += (mem_mask & 0x10000) ? 1 : 0; // AliasScopeINTELMask - cur += (mem_mask & 0x20000) ? 1 : 0; // NoAliasINTELMask - GGML_ASSERT(cur < pos + wc); - - uint32_t ta_mask = code[cur]; - if ((ta_mask & kSpvTensorAddressingDecodeVectorFuncBit) == 0) { - pos += wc; - continue; // leave instruction inside the current unchanged run - } - - flush_run(pos); - - // Append unchanged prefix of the instruction (header through the mem-extras). - size_t inst_start = out.size(); - size_t pre_n = cur - pos; - out.insert(out.end(), code + pos, code + pos + pre_n); - - // Emit TA mask with the DecodeVectorFunc bit cleared. - out.push_back(ta_mask & ~kSpvTensorAddressingDecodeVectorFuncBit); - - // TA extras: TensorView (0x1) and DecodeFunc (0x2) are kept verbatim; - // DecodeVectorFunc (0x4) is dropped along with its trailing id operand. - size_t keep_ta_extras = ((ta_mask & 0x1) ? 1 : 0) + ((ta_mask & 0x2) ? 1 : 0); - if (keep_ta_extras) { - out.insert(out.end(), code + cur + 1, code + cur + 1 + keep_ta_extras); - } - - GGML_ASSERT(wc == pre_n + 1 + keep_ta_extras + 1); - - // Patch the instruction header with the new (one-shorter) word count. - uint32_t new_wc = wc - 1; - out[inst_start] = (new_wc << spv::WordCountShift) | op; - - pos += wc; - run_start = pos; - continue; - } - - pos += wc; - } - - flush_run(word_count); - return true; -} - -static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint, - uint32_t parameter_count, std::array wg_denoms, std::vector specialization_constants, - bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) { - VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << pipeline->name << ", " << entrypoint << ", " << parameter_count << - ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << - disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")"); - GGML_ASSERT(parameter_count > 0); - GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT); - GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT - - vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); - - // Patch SPIR-V to enable RTE rounding for FP16, avoiding the need for - // separate shader variants compiled with -DRTE16. - std::vector spirv; - if (device->float_controls_rte_fp16) { - const uint32_t* spv_words = reinterpret_cast(spv_data); - size_t word_count = spv_size / sizeof(uint32_t); - spirv.assign(spv_words, spv_words + word_count); - - // Find insertion points respecting SPIR-V layout order: - // Header(5) -> OpCapability -> OpExtension -> ... -> OpEntryPoint -> OpExecutionMode -> ... - size_t pos = 5; // skip header - size_t cap_insert_pos = pos; - size_t ext_insert_pos = pos; - size_t exec_insert_pos = pos; - uint32_t entry_point_id = 0; - - while (pos < spirv.size()) { - uint32_t opcode = spirv[pos] & spv::OpCodeMask; - uint32_t len = spirv[pos] >> spv::WordCountShift; - if (len == 0) break; - - if (opcode == spv::OpCapability) { - cap_insert_pos = pos + len; - ext_insert_pos = pos + len; - } else if (opcode == spv::OpExtension) { - ext_insert_pos = pos + len; - } else if (opcode == spv::OpEntryPoint) { - entry_point_id = spirv[pos + 2]; - exec_insert_pos = pos + len; - } else if (opcode == spv::OpExecutionMode || opcode == spv::OpExecutionModeId) { - exec_insert_pos = pos + len; - } else if (entry_point_id != 0) { - break; - } - - pos += len; - } - - // Insert from latest position first so earlier indices stay valid. - - // OpExecutionMode %entrypoint RoundingModeRTE 16 - uint32_t exec_mode[] = { (4u << spv::WordCountShift) | spv::OpExecutionMode, entry_point_id, spv::ExecutionModeRoundingModeRTE, 16 }; - spirv.insert(spirv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode)); - - // OpExtension "SPV_KHR_float_controls" - const char ext_str[] = "SPV_KHR_float_controls"; - size_t ext_str_words = CEIL_DIV(sizeof(ext_str), sizeof(uint32_t)); - std::vector extension(1 + ext_str_words, 0); - extension[0] = (uint32_t)((1 + ext_str_words) << spv::WordCountShift) | spv::OpExtension; - memcpy(&extension[1], ext_str, sizeof(ext_str)); - spirv.insert(spirv.begin() + ext_insert_pos, extension.begin(), extension.end()); - - // OpCapability RoundingModeRTE - uint32_t capability[] = { (2u << spv::WordCountShift) | spv::OpCapability, spv::CapabilityRoundingModeRTE }; - spirv.insert(spirv.begin() + cap_insert_pos, std::begin(capability), std::end(capability)); - - shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data()); - } - -#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) - if (device->coopmat2 && !device->coopmat2_decode_vector) { - const uint32_t * src = spirv.empty() ? reinterpret_cast(spv_data) : spirv.data(); - size_t src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size(); - std::vector stripped; - if (ggml_vk_strip_decode_vector(src, src_n, stripped)) { - spirv = std::move(stripped); - shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data()); - } - } -#endif - - pipeline->shader_module = device->device.createShaderModule(shader_module_create_info); - - vk::PushConstantRange pcr( - vk::ShaderStageFlagBits::eCompute, - 0, - pipeline->push_constant_size - ); - - vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr); - pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info); - - std::vector specialization_entries(specialization_constants.size()); - - for (size_t i = 0; i < specialization_constants.size(); i++) { - specialization_entries[i].constantID = i; - specialization_entries[i].offset = i * sizeof(uint32_t); - specialization_entries[i].size = sizeof(uint32_t); - } - - vk::SpecializationInfo specialization_info( - specialization_entries.size(), - specialization_entries.data(), - specialization_constants.size() * sizeof(uint32_t), - specialization_constants.data() - ); - - vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{}; - - if (device->subgroup_require_full_support && require_full_subgroups) { - pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT; - } - - vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( - pipeline_shader_stage_create_flags, - vk::ShaderStageFlagBits::eCompute, - pipeline->shader_module, - entrypoint.c_str(), - &specialization_info); - - vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info; - pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size; - if (device->subgroup_size_control && required_subgroup_size > 0) { - GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size); - pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info); - } - - vk::ComputePipelineCreateInfo compute_pipeline_create_info( - device->pipeline_executable_properties_support ? - vk::PipelineCreateFlagBits::eCaptureStatisticsKHR : - vk::PipelineCreateFlags{}, - pipeline_shader_create_info, - pipeline->layout); - - vk::PipelineRobustnessCreateInfoEXT rci; - - if (device->pipeline_robustness && disable_robustness) { - rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; - rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; - compute_pipeline_create_info.setPNext(&rci); - } - -#if defined(VK_EXT_shader_64bit_indexing) - vk::PipelineCreateFlags2CreateInfo pipelineFlags2CreateInfo; - if (pipeline->is_64b_indexing) - { - pipelineFlags2CreateInfo.flags = vk::PipelineCreateFlagBits2::e64BitIndexingEXT; - if (device->pipeline_executable_properties_support) { - pipelineFlags2CreateInfo.flags |= vk::PipelineCreateFlagBits2::eCaptureStatisticsKHR; - } - pipelineFlags2CreateInfo.setPNext(compute_pipeline_create_info.pNext); - compute_pipeline_create_info.setPNext(&pipelineFlags2CreateInfo); - } -#endif - - try { - pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - } catch (const vk::SystemError& e) { - std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl; - std::cerr << "ggml_vulkan: " << e.what() << std::endl; - throw e; - } - - if (vk_instance.debug_utils_support) { - vk::DebugUtilsObjectNameInfoEXT duoni; - duoni.objectType = vk::ObjectType::ePipeline; - duoni.pObjectName = pipeline->name.c_str(); - duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast(pipeline->pipeline)); - vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); - } - - if (device->pipeline_executable_properties_support) { - vk::PipelineExecutableInfoKHR executableInfo; - executableInfo.pipeline = pipeline->pipeline; - - auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo); - - bool print_stats = !vk_pipeline_stats_filter.empty() && - pipeline->name.find(vk_pipeline_stats_filter) != std::string::npos; - if (print_stats) { - std::cerr << "ggml_vulkan: pipeline stats for " << pipeline->name << ":" << std::endl; - } - - for (auto & s : statistics) { - if (print_stats) { - std::cerr << "ggml_vulkan: " << s.name.data() << ": "; - switch (s.format) { - case vk::PipelineExecutableStatisticFormatKHR::eBool32: - std::cerr << (s.value.b32 ? "true" : "false"); - break; - case vk::PipelineExecutableStatisticFormatKHR::eInt64: - std::cerr << s.value.i64; - break; - case vk::PipelineExecutableStatisticFormatKHR::eUint64: - std::cerr << s.value.u64; - break; - case vk::PipelineExecutableStatisticFormatKHR::eFloat64: - std::cerr << s.value.f64; - break; - } - std::cerr << std::endl; - } - // "Register Count" is reported by NVIDIA drivers. - if (strcmp(s.name, "Register Count") == 0) { - VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64 << " registers"); - pipeline->register_count = (uint32_t)s.value.u64; - } - } - } - - { - std::lock_guard guard(device->compile_mutex); - device->all_pipelines.push_back(pipeline); - pipeline->compiled = true; - pipeline->compile_pending = false; - } - device->compile_cv.notify_all(); -} - -static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { - VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")"); - device.destroyPipelineLayout(pipeline->layout); - - device.destroyShaderModule(pipeline->shader_module); - - device.destroyPipeline(pipeline->pipeline); -} - -static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) { - VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); - ctx->pipeline_descriptor_set_requirements += n; - if (!pipeline->compiled) { - ggml_vk_load_shaders(ctx->device, pipeline); - } - ggml_pipeline_allocate_descriptor_sets(ctx); -} - -static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) { - - if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) { - // Enough descriptors are available - return; - } - - vk_device& device = ctx->device; - - // Grow by 50% to avoid frequent allocations - uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements}); - uint32_t to_alloc = needed - ctx->descriptor_sets.size(); - uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; - uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; - - while (to_alloc > 0) { - const uint32_t alloc_count = std::min(pool_remaining, to_alloc); - to_alloc -= alloc_count; - pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; - - if (pool_idx >= ctx->descriptor_pools.size()) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); - ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - } - - std::vector layouts(alloc_count); - for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = device->dsl; - } - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data()); - std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); - ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end()); - - pool_idx++; - } -} - -static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) { - VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()"); - vk::CommandBufferAllocateInfo command_buffer_alloc_info( - p.pool, - vk::CommandBufferLevel::ePrimary, - 1); - const std::vector cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info); - p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true }); - return &p.cmd_buffers[p.cmd_buffers.size()-1]; -} - -static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { - if (ctx->seqs.empty()) { - if (fence) { - std::lock_guard guard(queue_mutex); - ctx->p->q->queue.submit({}, fence); - } - return; - } - VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); - - std::vector> tl_wait_vals; - std::vector> tl_signal_vals; - std::vector> tl_wait_semaphores; - std::vector> tl_signal_semaphores; - std::vector tl_submit_infos; - std::vector submit_infos; - int idx = -1; - std::vector> stage_flags; - - size_t reserve = 0; - - for (const auto& sequence : ctx->seqs) { - reserve += sequence.size(); - } - - // Pre-reserve vectors to prevent reallocation, which invalidates pointers - tl_wait_semaphores.reserve(reserve); - tl_wait_vals.reserve(reserve); - tl_signal_semaphores.reserve(reserve); - tl_signal_vals.reserve(reserve); - tl_submit_infos.reserve(reserve); - submit_infos.reserve(reserve); - stage_flags.reserve(reserve); - - for (const auto& sequence : ctx->seqs) { - for (const auto& submission : sequence) { - stage_flags.push_back({}); - idx++; - tl_wait_vals.push_back({}); - tl_wait_semaphores.push_back({}); - tl_signal_vals.push_back({}); - tl_signal_semaphores.push_back({}); - for (size_t i = 0; i < submission.wait_semaphores.size(); i++) { - stage_flags[idx].push_back(ctx->p->q->stage_flags); - tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value); - tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s); - } - for (size_t i = 0; i < submission.signal_semaphores.size(); i++) { - tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value); - tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s); - } - tl_submit_infos.push_back({ - (uint32_t) submission.wait_semaphores.size(), - tl_wait_vals[idx].data(), - (uint32_t) submission.signal_semaphores.size(), - tl_signal_vals[idx].data(), - }); - tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo; - tl_submit_infos[idx].pNext = nullptr; - vk::SubmitInfo si{ - (uint32_t) submission.wait_semaphores.size(), - tl_wait_semaphores[idx].data(), - stage_flags[idx].data(), - 1, - &submission.buffer->buf, - (uint32_t) submission.signal_semaphores.size(), - tl_signal_semaphores[idx].data(), - }; - si.setPNext(&tl_submit_infos[idx]); - submit_infos.push_back(si); - } - } - - std::lock_guard guard(queue_mutex); - ctx->p->q->queue.submit(submit_infos, fence); - - ctx->seqs.clear(); -} - -static uint32_t ggml_vk_find_queue_family_index(std::vector& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) { - VK_LOG_DEBUG("ggml_vk_find_queue_family_index()"); - const uint32_t qfsize = queue_family_props.size(); - - // Try with avoid preferences first - for (uint32_t i = 0; i < qfsize; i++) { - if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) { - return i; - } - } - - // Fall back to only required - for (size_t i = 0; i < qfsize; i++) { - if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) { - return i; - } - } - - // Fall back to reusing compute queue - for (size_t i = 0; i < qfsize; i++) { - if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) { - return i; - } - } - - // Fall back to ignoring min_num_queries - for (size_t i = 0; i < qfsize; i++) { - if (queue_family_props[i].queueFlags & required) { - return i; - } - } - - // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations. - // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional. - if (compute_index >= 0) { - return compute_index; - } - - std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl; - - for(auto &q_family : queue_family_props) { - std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl; - } - abort(); -} - -static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) { - VK_LOG_DEBUG("ggml_vk_create_queue()"); - std::lock_guard guard(device->mutex); - - q.queue_family_index = queue_family_index; - q.transfer_only = transfer_only; - - q.cmd_pool.init(device, &q); - - q.queue = device->device.getQueue(queue_family_index, queue_index); - - q.stage_flags = stage_flags; -} - -static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) { - vk_context result = std::make_shared(); - VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); - ctx->gc.contexts.emplace_back(result); - result->p = &p; - return result; -} - -static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) { - vk_context result = std::make_shared(); - VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); - result->p = &p; - return result; -} - -static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) { - VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()"); - vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 }; - vk::SemaphoreCreateInfo ci{}; - ci.setPNext(&tci); - vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); - ctx->gc.semaphores.push_back({ semaphore, 0 }); - return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1]; -} - -static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) { - VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()"); - if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) { - vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; - vk::SemaphoreCreateInfo ci{}; - ci.setPNext(&tci); - vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); - ctx->gc.tl_semaphores.push_back({ semaphore, 0 }); - } - return &ctx->gc.tl_semaphores[ctx->semaphore_idx++]; -} - -static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { - if (ctx->event_idx >= ctx->gc.events.size()) { - ctx->gc.events.push_back(ctx->device->device.createEvent({})); - } - return ctx->gc.events[ctx->event_idx++]; -} - -static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) { - VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()"); - - // Requires command buffers to be done - device->device.resetCommandPool(p.pool); - // Don't clear the command buffers and mark them as not in use. - // This allows us to reuse them - for (auto& cmd_buffer : p.cmd_buffers) { - cmd_buffer.in_use = false; - } -} - -static void ggml_vk_queue_command_pools_cleanup(vk_device& device) { - VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()"); - - // Arbitrary frequency to cleanup/reuse command buffers - static constexpr uint32_t cleanup_frequency = 10; - - if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) { - ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool); - } - if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) { - ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool); - } -} - -static std::vector ggml_vk_find_memory_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { - std::vector indices; - - for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { - vk::MemoryType memory_type = mem_props->memoryTypes[i]; - if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) && - (flags & memory_type.propertyFlags) == flags && - mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) { - indices.push_back(i); - } - } - return indices; -} - -static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list & req_flags_list, - void *import_ptr = nullptr) { - VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")"); - if (size > device->max_buffer_size) { - throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit"); - } - - vk_buffer buf = std::make_shared(); - - if (size == 0) { - buf->size = 0; - return buf; - } - - vk::BufferUsageFlags usage_flags = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; - vk::MemoryAllocateFlags mem_flags {}; - if (device->buffer_device_address) { - usage_flags |= vk::BufferUsageFlagBits::eShaderDeviceAddress; - mem_flags |= vk::MemoryAllocateFlagBits::eDeviceAddress; - } - - vk::BufferCreateInfo buffer_create_info{ - vk::BufferCreateFlags(), - size, - usage_flags, - vk::SharingMode::eExclusive, - 0, - nullptr, - }; - - vk::ExternalMemoryBufferCreateInfo external_memory_bci; - if (import_ptr) { - external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; - buffer_create_info.setPNext(&external_memory_bci); - } - - buf->buffer = device->device.createBuffer(buffer_create_info); - - vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer); - - vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); - - const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f }; - - vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags }; - - if (device->memory_priority) { - mem_flags_info.setPNext(&mem_priority_info); - } - - if (import_ptr) { - vk::MemoryHostPointerPropertiesEXT host_pointer_props; - try { - host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr); - } catch (vk::SystemError& e) { - GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what()); - device->device.destroyBuffer(buf->buffer); - return {}; - } - vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); - - uint32_t memory_type_idx; - vk::MemoryPropertyFlags property_flags = *req_flags_list.begin(); - for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) { - if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) { - continue; - } - if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) { - continue; - } - - vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx]; - // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed - if ((memory_type.propertyFlags & property_flags) == property_flags) { - property_flags = memory_type.propertyFlags; - break; - } - } - if (memory_type_idx == 32) { - GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n"); - device->device.destroyBuffer(buf->buffer); - return {}; - } - - buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags; - try { - vk::ImportMemoryHostPointerInfoEXT import_info; - import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; - import_info.pHostPointer = import_ptr; - import_info.setPNext(&mem_flags_info); - buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info }); - } catch (const vk::SystemError& e) { - } - } else { - for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) { - const auto & req_flags = *it; - - const std::vector memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags); - - if (memory_type_indices.empty()) { - continue; - } - - bool done = false; - - for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) { - try { - buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info }); - buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags; - done = true; - break; - } catch (const vk::SystemError& e) { - // loop and retry - // during last attempt throw the exception - if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) { - device->device.destroyBuffer(buf->buffer); - throw e; - } - } - } - - if (done) { - break; - } - } - } - - if (!buf->device_memory) { - device->device.destroyBuffer(buf->buffer); - throw vk::OutOfDeviceMemoryError("No suitable memory type found"); - } - - buf->ptr = nullptr; - - if (import_ptr) { - buf->ptr = import_ptr; - } else { - if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); - } - } - - device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); - - buf->device = device; - buf->size = size; - - if (device->buffer_device_address) { - const vk::BufferDeviceAddressInfo addressInfo(buf->buffer); - buf->bda_addr = device->device.getBufferAddress(addressInfo); - } - - device->memory_logger->log_allocation(buf, size); - - return buf; -} - -static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { - try { - return ggml_vk_create_buffer(device, size, {req_flags, fallback_flags}); - } catch (const vk::SystemError& e) { - std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; - std::cerr << "ggml_vulkan: " << e.what() << std::endl; - throw e; - } -} - -static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { - vk_buffer buf; - try { - if (device->prefer_host_memory) { - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, - vk::MemoryPropertyFlagBits::eDeviceLocal}); - } else if (device->uma) { - // On UMA, prefer host-visible memory so direct tensor borrowing works. - // If unavailable, fall back to device-local memory. - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, - vk::MemoryPropertyFlagBits::eDeviceLocal, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); - } else if (device->disable_host_visible_vidmem) { - if (device->allow_sysmem_fallback) { - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); - } else { - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - } - } else { - // use rebar if available, otherwise fallback to device only visible memory - if (device->allow_sysmem_fallback) { - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, - vk::MemoryPropertyFlagBits::eDeviceLocal, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); - } else { - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, - vk::MemoryPropertyFlagBits::eDeviceLocal}); - } - } - } catch (const vk::SystemError& e) { - std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; - std::cerr << "ggml_vulkan: " << e.what() << std::endl; - throw e; - } - - return buf; -} - -static void ggml_vk_destroy_buffer(vk_buffer& buf) { - if (buf == nullptr) { - return; - } - - if (buf->device != nullptr) { - buf->device->memory_logger->log_deallocation(buf); - } - - buf.reset(); -} - -static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) { - return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) }; -} - -static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) { - VK_LOG_DEBUG("ggml_vk_sync_buffers()"); - - const bool transfer_queue = subctx->p->q->transfer_only; - - if (ctx) { - ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; - } - - subctx->s->buffer->buf.pipelineBarrier( - subctx->p->q->stage_flags, - subctx->p->q->stage_flags, - {}, - { { - { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, - { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) } - } }, - {}, - {} - ); -} - -static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) { - VK_LOG_DEBUG("ggml_vk_set_event()"); - - ctx->s->buffer->buf.resetEvent( - event, - ctx->p->q->stage_flags - ); -} - -static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) { - VK_LOG_DEBUG("ggml_vk_set_event()"); - - ctx->s->buffer->buf.setEvent( - event, - ctx->p->q->stage_flags - ); -} - -static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events) { - VK_LOG_DEBUG("ggml_vk_wait_events()"); - if (events.empty()) { - return; - } - - ctx->s->buffer->buf.waitEvents( - events, - ctx->p->q->stage_flags, - ctx->p->q->stage_flags, - {}, - {}, - {} - ); -} - -struct vk_fa_tuning_params { - FaCodePath path; - uint32_t workgroup_size; - uint32_t subgroup_size; - uint32_t block_rows; - uint32_t block_cols; - uint32_t d_split; - uint32_t row_split; - bool shmem_staging; - bool disable_subgroups; - uint32_t limit_occupancy_shmem; - - void print() const { - std::cerr << "path=" << path << " workgroup_size=" << workgroup_size << " subgroup_size=" << subgroup_size << - " block_rows=" << block_rows << " block_cols=" << block_cols << " d_split=" << d_split << - " row_split=" << row_split << " shmem_staging=" << shmem_staging << " disable_subgroups=" << disable_subgroups << - " limit_occupancy_shmem=" << limit_occupancy_shmem << std::endl; - } -}; - -static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type); -static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16); - -static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { - - vk_fa_tuning_params result{}; - result.path = FA_SCALAR; - - if (device->vendor_id == VK_VENDOR_ID_INTEL) { - // Disable subgroup use due to performance issues when enforcing subgroup sizes - result.subgroup_size = 32; - result.disable_subgroups = true; - } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN) { - result.subgroup_size = n_rows < 4 ? 32 : device->subgroup_size; - } else { - result.subgroup_size = device->subgroup_size; - } - - // Row split splits the workgroup so that synchronization only has to happen within subgroups, which avoids barriers - uint32_t row_split_max_hsk = 64; - if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !device->uma) { - row_split_max_hsk = n_rows <= 8 ? 64 : 128; - } - result.row_split = (n_rows < 4 || hsk <= row_split_max_hsk) ? 1 : 4; - - if (result.subgroup_size > 32 && (n_rows < 4 || hsk < (result.row_split == 1 ? 128 : 64))) { - result.workgroup_size = result.subgroup_size * 2; - } else { - result.workgroup_size = result.subgroup_size * 4; - } - - const uint32_t D = hsk | hsv; - - const bool reduce_block_rows = D & 8 || n_kv < 1024 || device->vendor_id == VK_VENDOR_ID_INTEL; - - if (n_rows == 1) { - result.block_rows = 1; - result.block_cols = 64; - } else { - // row_split 1 means higher register use per row, so block size has to be adjusted - if (result.row_split == 1) { - result.block_rows = n_rows == 2 ? 2 : ((n_rows <= 4 || reduce_block_rows) ? 4 : 8); - } else { - result.block_rows = n_rows <= 4 ? 4 : ((n_rows <= 8 || reduce_block_rows) ? 8 : 16); - } - - result.block_cols = (D & 8) ? 64 : 32; - } - - const uint32_t D_lsb = D ^ (D & (D-1)); // extract lowest set bit - - result.d_split = std::min(std::min(result.subgroup_size, 8u), D_lsb / 4); - - result.shmem_staging = (device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256) ? 1 : 0; - - if (!reduce_block_rows && !ggml_vk_flash_attn_scalar_shmem_support(device, result, hsk, hsv, f32acc, k_type, v_type)) { - result.block_rows /= 2; - } - - // On AMD RDNA, for small head sizes and big batch size the shader uses few registers, so too many subgroups get scheduled - // at once and end up thrashing the cache. Fix this by setting a large (unused) shmem buffer that reduces occupancy. - // This targets an occupancy of 4 subgroups per SIMD. - if (device->vendor_id == VK_VENDOR_ID_AMD && device->properties.limits.maxComputeSharedMemorySize == 65536) { - if (device->architecture != AMD_GCN && n_rows >= 64 && hsk <= 128) { - // 30kb target for hsk > 64, 26kb for <= 64 due to smaller workgroup size - // Values are guessed, tested on RDNA2 - result.limit_occupancy_shmem = (hsk <= 64 ? 26 : 30) * 1024 / 4 / 4; - } else if (device->architecture == AMD_GCN && n_rows <= 8 && hsk >= 256) { - // Same thing for GCN, with an occupancy target of 2 subgroups per SIMD. - // Here low-batch FA with large head size is affected. - // n_rows < 4 switch because workgroup size switches from 128 to 256 there. - result.limit_occupancy_shmem = (n_rows < 4 ? 14 : 26) * 1024 / 4 / 4; - } - } - - return result; -} - -static vk_fa_tuning_params get_fa_tuning_params_coopmat1(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { - GGML_UNUSED(n_rows); - GGML_UNUSED(n_kv); - GGML_UNUSED(k_type); - GGML_UNUSED(v_type); - GGML_UNUSED(f32acc); - - vk_fa_tuning_params result{}; - result.path = FA_COOPMAT1; - - const uint32_t D = hsk | hsv; - - const uint32_t coopmat_block_rows = 16; - const uint32_t coopmat_block_cols = 16; - - const uint32_t num_subgroups = 4; - - result.block_rows = coopmat_block_rows; - result.block_cols = coopmat_block_cols * num_subgroups; - result.row_split = num_subgroups; - result.subgroup_size = device->subgroup_size; - result.workgroup_size = num_subgroups * result.subgroup_size; - - const uint32_t D_lsb = D ^ (D & (D-1)); // extract lowest set bit - result.d_split = std::min(std::min(result.subgroup_size, 8u), D_lsb / 4); - - result.shmem_staging = (device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256) ? 1 : 0; - - return result; -} - -static vk_fa_tuning_params get_fa_tuning_params_coopmat2(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { - GGML_UNUSED(n_kv); - GGML_UNUSED(f32acc); - - vk_fa_tuning_params result{}; - result.path = FA_COOPMAT2; - - const uint32_t D = hsk | hsv; - - const bool small_rows = n_rows < 32; - - if (small_rows) { - result.block_rows = 32; - result.block_cols = 32; - } else if (ggml_is_quantized(k_type) || ggml_is_quantized(v_type) || hsk >= 256 || hsv >= 256) { - result.block_rows = (hsk >= 512 || hsv >= 512) ? 32 : 64; - result.block_cols = 32; - } else { - result.block_rows = 64; - result.block_cols = 64; - } - - result.subgroup_size = device->subgroup_size; - result.workgroup_size = (small_rows && (D % 32) == 0) ? 256 : 128; - - return result; -} - -static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { - FaCodePath path = device->coopmat2 ? FA_COOPMAT2 : - device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR; - - if (path == FA_COOPMAT2 && k_type == GGML_TYPE_BF16 && !device->coopmat2_bf16_support) { - path = FA_COOPMAT1; - } - if (path == FA_COOPMAT1 && k_type == GGML_TYPE_BF16 && !device->coopmat_bf16_support) { - path = FA_SCALAR; - } - - if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) { - // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090 - path = FA_SCALAR; - } - - if (path == FA_COOPMAT1) { - bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) || - (!f32acc && device->coopmat_support_16x16x16_f16acc); - const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); - bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc, k_type); - - if (!shape_ok || !shmem_ok) { - path = FA_SCALAR; - } - } - - // scalar is faster than coopmat when N==1 - if (n_rows == 1 && (path == FA_COOPMAT1 || path == FA_COOPMAT2)) { - path = FA_SCALAR; - } - - // Q1_0 K/V is only implemented on coopmat2 (flash_attn_cm2); there is no scalar FA shader for it. - if ((k_type == GGML_TYPE_Q1_0 || v_type == GGML_TYPE_Q1_0) && device->coopmat2) { - path = FA_COOPMAT2; - } - - switch (path) { - case FA_SCALAR: - return get_fa_tuning_params_scalar(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); - case FA_COOPMAT1: - return get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); - case FA_COOPMAT2: - return get_fa_tuning_params_coopmat2(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); - default: - throw std::runtime_error("unsupported FaCodePath"); - } -} - -static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, - bool use_mask, bool use_mask_opt, bool use_logit_softcap, ggml_type k_type, ggml_type v_type) { - const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary && - (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2); - - uint32_t flags = (use_mask_opt ? 1 : 0) | - (use_mask ? 2 : 0) | - (use_logit_softcap ? 4 : 0) | - (old_amd_windows ? 8 : 0); - - const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size; - - return vk_fa_pipeline_state{hsk, hsv, params.block_rows, params.block_cols, params.d_split, params.row_split, params.shmem_staging, params.path, params.workgroup_size, subgroup_size, aligned, f32acc, flags, params.limit_occupancy_shmem, k_type, v_type}; -} - -static std::vector get_fa_spec_constants(const vk_fa_pipeline_state& state) { - const auto fa_block_bytes = [](ggml_type t) -> uint32_t { - if (t == GGML_TYPE_F32) return 16u; - return (uint32_t) ggml_type_size(t); - }; - return { - /* 0 WorkGroupSize */ state.workgroup_size, - /* 1 Br */ state.Br, - /* 2 Bc */ state.Bc, - /* 3 HSK */ state.HSK, - /* 4 HSV */ state.HSV, - /* 5 Clamp */ static_cast(!state.aligned), - /* 6 D_split */ state.D_split, - /* 7 row_split */ state.row_split, - /* 8 SubGroupSize */ state.subgroup_size, - /* 9 SHMEM_STAGING */ state.shmem_staging ? 1u : 0u, - /*10 Flags */ state.flags, - /*11 LIMIT_OCCUPANCY_SHMEM */ state.limit_occupancy_shmem, - /*12 FaTypeK */ static_cast(state.k_type), - /*13 FaTypeV */ static_cast(state.v_type), - /*14 FaBlockBytesK */ fa_block_bytes(state.k_type), - /*15 FaBlockBytesV */ fa_block_bytes(state.v_type), - }; -} - -static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { - - uint32_t lut_size = 0; - switch (src0_type) { - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - // Regular matmul uses the compact uint16_t IQ1 grid; the expanded - // uint32_t grid is only enabled for the q8_1/int-dot vector path. - lut_size = 2*2048; - break; - case GGML_TYPE_IQ2_XXS: - lut_size = 8*256; - break; - case GGML_TYPE_IQ2_XS: - lut_size = 8*512; - break; - case GGML_TYPE_IQ2_S: - lut_size = 8*1024; - break; - case GGML_TYPE_IQ3_XXS: - lut_size = 4*256; - break; - case GGML_TYPE_IQ3_S: - lut_size = 4*512; - break; - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_MXFP4: - lut_size = 4*16; - break; - case GGML_TYPE_NVFP4: - // Same kvalues budget as MXFP4 plus ue4m3_fp32_lut[128] (types.glsl, DATA_A_NVFP4). - lut_size = 4*16 + 128u * (uint32_t)sizeof(float); - break; - default: - break; - } - - // Needs to be kept up to date on shader changes - const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1; - const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); - const uint32_t warps = warptile[0] / warptile[10]; - - const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; - const uint32_t mmid_row_ids = mul_mat_id ? (warptile[2] * 2 * sizeof(uint16_t)) : 0; - const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; - const uint32_t ballots_sh = mul_mat_id ? (warps * 4 * sizeof(uint32_t)) : 0; - - const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size + ballots_sh; - const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; - - VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), " - "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", supported=" << supported); - - return supported; -} - -// Shmem usage for the q8_1 mmq shader (mul_mmq.comp), which uses -// block_a_cache / block_b_cache layouts (see mul_mmq_shmem_types.glsl) rather -// than the float load buffers checked by ggml_vk_matmul_shmem_support. -// Sizes follow std430 rules. Returns false for types without a q8_1 pipeline. -static bool ggml_vk_matmul_int_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { - - // FLOAT_TYPE in the shader is float16_t with fp16 support, otherwise float. - const uint32_t fp_size = device->fp16 ? 2u : 4u; - const uint32_t fp_align = fp_size; - const uint32_t fp2_size = 2u * fp_size; - const uint32_t fp2_align = device->fp16 ? 4u : 8u; - - struct member { uint32_t size, align; }; - auto std430_size = [](std::initializer_list members) { - uint32_t off = 0, struct_align = 1; - for (const auto &m : members) { - off = (off + m.align - 1) & ~(m.align - 1); - off += m.size; - struct_align = std::max(struct_align, m.align); - } - return (off + struct_align - 1) & ~(struct_align - 1); - }; - - uint32_t block_a_size = 0; - switch (src0_type) { - case GGML_TYPE_Q4_0: block_a_size = std430_size({{16, 4}, {fp_size, fp_align}}); break; // qs[16/4] + dm - case GGML_TYPE_Q4_1: block_a_size = std430_size({{16, 4}, {fp2_size, fp2_align}}); break; // qs[16/4] + dm(vec2) - case GGML_TYPE_Q5_0: block_a_size = std430_size({{16, 4}, {4, 4}, {fp_size, fp_align}}); break; // qs[16/4] + qh + dm - case GGML_TYPE_Q5_1: block_a_size = std430_size({{16, 4}, {4, 4}, {fp2_size, fp2_align}}); break; // qs[16/4] + qh + dm(vec2) - case GGML_TYPE_Q8_0: block_a_size = std430_size({{32, 4}, {fp_size, fp_align}}); break; // qs[8] + dm - case GGML_TYPE_MXFP4: block_a_size = std430_size({{32, 4}, {fp_size, fp_align}}); break; // qs[8] + d - case GGML_TYPE_Q2_K: block_a_size = std430_size({{ 8, 4}, {2, 2}, {fp2_size, fp2_align}}); break; // qs[2] + scales(u8vec2) + dm(vec2) - case GGML_TYPE_Q3_K: block_a_size = std430_size({{16, 4}, {fp2_size, fp2_align}}); break; // qs[4] + d_scales(vec2) - case GGML_TYPE_Q4_K: block_a_size = std430_size({{16, 4}, {fp2_size, fp2_align}}); break; // qs[4] + dm(vec2) - case GGML_TYPE_Q5_K: block_a_size = std430_size({{32, 4}, {fp2_size, fp2_align}}); break; // qs[8] + dm(vec2) - case GGML_TYPE_Q6_K: block_a_size = std430_size({{32, 4}, {fp2_size, fp2_align}}); break; // qs[8] + d_scales(vec2) - default: - return false; - } - - // block_b_cache: { int32_t qs[8]; FLOAT_TYPEV2 ds; } - const uint32_t block_b_size = std430_size({{32, 4}, {fp2_size, fp2_align}}); - - const uint32_t BM = warptile[1]; - const uint32_t BN = warptile[2]; - // mul_mmq.comp: BK_STEP=1 for MUL_MAT_ID, 4 otherwise. - const uint32_t BK_STEP = mul_mat_id ? 1u : 4u; - - const uint32_t buf_a_size = BM * BK_STEP * block_a_size; - const uint32_t buf_b_size = BN * BK_STEP * block_b_size; - const uint32_t mmid_row_ids = mul_mat_id ? (BN * 2u * (uint32_t)sizeof(uint16_t)) : 0u; - - const uint32_t warps = warptile[0] / warptile[10]; - const uint32_t ballots_sh = mul_mat_id ? (warps * 4u * (uint32_t)sizeof(uint32_t)) : 0u; - - const uint32_t total_size = buf_a_size + buf_b_size + mmid_row_ids + ballots_sh; - const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; - - VK_LOG_DEBUG("ggml_vk_matmul_int_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), " - "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", total=" << total_size << ", supported=" << supported); - - return supported; -} - -struct GpuPipelineConfig { - // GPU architecture identifier. - // Example: vk_device_architecture::AMD_GCN - vk_device_architecture arch; - - // Mapping of pipeline names to their specific subgroup sizes. - // Example: {"soft_max_f32", 64} - std::unordered_map pipelines; - - // Default subgroup size for this GPU. - // Defaults to 0 if not explicitly provided. - uint32_t default_subgroup_size = 0; -}; - -// Pipeline configuration for RDNA1 GPUs. -static const std::unordered_map rdna1_pipelines = { - {"soft_max", 64}, {"im2col", 64}, - {"argmax", 64}, {"mul_mat_vec", 64}, - {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32} -}; - -// Pipeline configuration for RDNA2 GPUs. -static const std::unordered_map rdna2_pipelines = { - {"soft_max", 64}, {"im2col", 64}, -}; - -static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; - -// Define configurations for different GPUs. -static std::vector gpu_pipeline_configs = { - { - vk_device_architecture::AMD_RDNA1, - { - rdna1_pipelines, - }, - RDNA_DEFAULT_SUBGROUP_SIZE - }, - { - vk_device_architecture::AMD_RDNA2, - { - rdna2_pipelines, - }, - RDNA_DEFAULT_SUBGROUP_SIZE - }, -}; - -static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { - for (const auto &config : gpu_pipeline_configs) { - if (config.arch == arch) { - auto pipIt = config.pipelines.find(pipeline_name); - if (pipIt != config.pipelines.end()) { - return pipIt->second; - } - std::vector> sorted_pipelines(config.pipelines.begin(), config.pipelines.end()); - std::sort(sorted_pipelines.begin(), sorted_pipelines.end(), - [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); }); - for (const auto &entry : sorted_pipelines) { - if (pipeline_name.find(entry.first) != std::string::npos) { - return entry.second; - } - } - return config.default_subgroup_size; - } - } - return 0; // If no matching configuration is found -} - -// Whether scalar flash attention will use the MMQ path for the given k_type. -static bool ggml_vk_fa_scalar_uses_mmq(const vk_device& device, ggml_type k_type) { -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - return device->integer_dot_product && device->subgroup_clustered && - (k_type == GGML_TYPE_Q4_0 || k_type == GGML_TYPE_Q4_1 || - k_type == GGML_TYPE_Q5_0 || k_type == GGML_TYPE_Q5_1 || - k_type == GGML_TYPE_Q8_0); -#else - GGML_UNUSED(device); - GGML_UNUSED(k_type); - return false; -#endif -} - -// load_shaders walks the pipeline list under compile_mutex and either claims -// the requested pipeline for compilation or, if another thread is already -// compiling it, drops the lock and waits on compile_cv. Compiles themselves -// run unlocked. -struct CompileTask { - vk_pipeline pipeline; - size_t spv_size; - const void * spv_data; - std::string entrypoint; - uint32_t parameter_count; - std::array wg_denoms; - std::vector specialization_constants; - bool disable_robustness; - bool require_full_subgroups; - uint32_t required_subgroup_size; -}; - -static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { - VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); - - // some shaders have a minimum subgroup size - const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u); - const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); - const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u); - - const uint32_t mul_mat_subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; - const uint32_t mul_mat_subgroup_size_8 = std::max(mul_mat_subgroup_size, 8u); - const uint32_t mul_mat_subgroup_size_16 = std::max(mul_mat_subgroup_size, 16u); - const uint32_t mul_mat_subgroup_size_32 = std::max(mul_mat_subgroup_size, 32u); - - const bool subgroup_min_size_16 = (!device->subgroup_size_control && device->subgroup_size >= 16) || - (device->subgroup_size_control && device->subgroup_max_size >= 16); - - // mulmat - std::vector l_warptile, m_warptile, s_warptile, - l_warptile_id, m_warptile_id, s_warptile_id, - l_warptile_mmq, m_warptile_mmq, s_warptile_mmq, - l_warptile_mmq_int, m_warptile_mmq_int, s_warptile_mmq_int, - l_warptile_mmq_int_k, m_warptile_mmq_int_k, s_warptile_mmq_int_k, - l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k, - l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid, - l_warptile_mmqid_int, m_warptile_mmqid_int, s_warptile_mmqid_int, - l_warptile_mmqid_int_k, m_warptile_mmqid_int_k, s_warptile_mmqid_int_k; - std::array l_wg_denoms, m_wg_denoms, s_wg_denoms, - l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms, - l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k, - l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; - - uint32_t l_align, m_align, s_align; - - vk_pipeline wait_pipeline; - CompileTask claimed_task {}; - bool has_claimed_task = false; - - // The rest of the walk reads and writes shared device state, so hold the - // lock until we're done deciding what to compile. - std::unique_lock compile_lock(device->compile_mutex); - - if (device->coopmat2) { - // spec constants and tile sizes for non-quant matmul/matmul_id - l_warptile = { 256, 128, 256, 64, 1 }; - m_warptile = { 256, 128, 128, 64, 0 }; - s_warptile = { 128, 64, 64, 64, 0 }; - l_wg_denoms = {128, 256, 1 }; - m_wg_denoms = {128, 128, 1 }; - s_wg_denoms = { 64, 64, 1 }; - - // spec constants and tile sizes for quant matmul (non-Qi_K) - l_warptile_mmq = { 256, 128, 256, 64, 1 }; - m_warptile_mmq = { 256, 128, 128, 64, 1 }; - s_warptile_mmq = { 256, 32, 64, 128, 0 }; - l_mmq_wg_denoms = { 128, 256, 1 }; - m_mmq_wg_denoms = { 128, 128, 1 }; - s_mmq_wg_denoms = { 32, 64, 1 }; - - // spec constants and tile sizes for quant matmul (Qi_K) - l_warptile_mmq_k = { 256, 128, 256, 64, 1 }; - m_warptile_mmq_k = { 256, 128, 128, 64, 1 }; - s_warptile_mmq_k = { 256, 32, 64, 128, 0 }; - l_mmq_wg_denoms_k = { 128, 256, 1 }; - m_mmq_wg_denoms_k = { 128, 128, 1 }; - s_mmq_wg_denoms_k = { 32, 64, 1 }; - - // spec constants and tile sizes for quant matmul_id - const uint32_t mmqid_bk = device->coopmat2_decode_vector ? 64u : 32u; - l_warptile_mmqid = { 256, 128, 128, mmqid_bk, 1, device->subgroup_size }; - m_warptile_mmqid = { 256, 128, 64, mmqid_bk, 0, device->subgroup_size }; - s_warptile_mmqid = { 256, 128, 64, mmqid_bk, 0, device->subgroup_size }; - l_mmqid_wg_denoms = { 128, 128, 1 }; - m_mmqid_wg_denoms = { 128, 64, 1 }; - s_mmqid_wg_denoms = { 128, 64, 1 }; - - l_align = 128; - m_align = 64; - s_align = 32; - } else { - // Matrix cores require different warp group sizes - const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4; - const uint32_t tm_m = device->coopmat_support ? device->coopmat_m : 4; - const uint32_t tm_s = device->coopmat_support ? device->coopmat_m : 2; - const uint32_t tn_l = device->coopmat_support ? device->coopmat_n : 4; - const uint32_t tn_m = device->coopmat_support ? device->coopmat_n : 2; - const uint32_t tn_s = device->coopmat_support ? device->coopmat_n : 2; - const uint32_t tk_l = device->coopmat_support ? device->coopmat_k : 1; - const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; - const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1; - - const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; - - l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; - m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; - - l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; - m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - s_warptile_mmq = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; - - // Integer MMQ has a smaller shared memory profile, but heavier register use - l_warptile_mmq_int = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; - m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 }; - s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, subgroup_size_8 }; - - // K-quants use even more registers, mitigate by setting WMITER to 1 - l_warptile_mmq_int_k = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 1, 4, 4, 1, subgroup_size_8 }; - m_warptile_mmq_int_k = { 128, 64, 64, 32, subgroup_size_8, 32, 1, 2, 2, 1, subgroup_size_8 }; - s_warptile_mmq_int_k = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, subgroup_size_8 }; - - l_warptile_id = { 128, 128, 128, 16, mul_mat_subgroup_size_16 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_16 }; - m_warptile_id = { 128, 64, 64, 16, mul_mat_subgroup_size_16, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_16 }; - s_warptile_id = { mul_mat_subgroup_size_16, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_16 }; - - l_warptile_mmqid = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_8 }; - m_warptile_mmqid = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_8 }; - s_warptile_mmqid = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_8 }; - - l_warptile_mmqid_int = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, 4, 4, 1, mul_mat_subgroup_size_8 }; - m_warptile_mmqid_int = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, 2, 2, 1, mul_mat_subgroup_size_8 }; - s_warptile_mmqid_int = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, mul_mat_subgroup_size_8 }; - - l_warptile_mmqid_int_k = { 128, 128, 128, 32, mul_mat_subgroup_size_16 * 2, 64, 1, 4, 4, 1, mul_mat_subgroup_size_16 }; - m_warptile_mmqid_int_k = { 128, 64, 64, 32, mul_mat_subgroup_size_16, 32, 1, 2, 2, 1, mul_mat_subgroup_size_16 }; - s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 }; - - // chip specific tuning - if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) { - m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; - m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; - } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary) { - // This is intentionally using tx_m values, slight performance increase - l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 }; - } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) { - // Xe2/Xe3 with coopmat enabled - warptile performance tuning - l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - } - - l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; - m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; - s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; - l_align = 128; - m_align = 64; - s_align = 32; - - for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { - ggml_type t = (ggml_type)i; - // Disable medium and large matrix multiplication if not enough shared memory is available - // Check mmq warptiles as the largest configuration - // Throw an error if not enough for any matrix multiplication is available - if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false, t)) { - std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl; - throw std::runtime_error("Shared memory size too small for matrix multiplication."); - } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false, t)) { - device->mul_mat_m[i] = false; - device->mul_mat_l[i] = false; - } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false, t)) { - device->mul_mat_l[i] = false; - } - - // Disable mul_mat_id if not enough shared memory is available - if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmqid, true, t)) { - device->mul_mat_id_s[i] = false; - device->mul_mat_id_m[i] = false; - device->mul_mat_id_l[i] = false; - } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmqid, true, t)) { - device->mul_mat_id_m[i] = false; - device->mul_mat_id_l[i] = false; - } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmqid, true, t)) { - device->mul_mat_id_l[i] = false; - } - - // The q8_1 mmq path has its own (larger) shmem layout, check it separately. - // K-quants use the _int_k warptiles, others use _int. - const bool is_k_quant = (t == GGML_TYPE_Q2_K || t == GGML_TYPE_Q3_K || - t == GGML_TYPE_Q4_K || t == GGML_TYPE_Q5_K || - t == GGML_TYPE_Q6_K); - const auto & s_int = is_k_quant ? s_warptile_mmq_int_k : s_warptile_mmq_int; - const auto & m_int = is_k_quant ? m_warptile_mmq_int_k : m_warptile_mmq_int; - const auto & l_int = is_k_quant ? l_warptile_mmq_int_k : l_warptile_mmq_int; - const auto & s_intid = is_k_quant ? s_warptile_mmqid_int_k : s_warptile_mmqid_int; - const auto & m_intid = is_k_quant ? m_warptile_mmqid_int_k : m_warptile_mmqid_int; - const auto & l_intid = is_k_quant ? l_warptile_mmqid_int_k : l_warptile_mmqid_int; - - if (!ggml_vk_matmul_int_shmem_support(device, s_int, false, t)) { - device->mul_mat_s_int[i] = false; - device->mul_mat_m_int[i] = false; - device->mul_mat_l_int[i] = false; - } else if (!ggml_vk_matmul_int_shmem_support(device, m_int, false, t)) { - device->mul_mat_m_int[i] = false; - device->mul_mat_l_int[i] = false; - } else if (!ggml_vk_matmul_int_shmem_support(device, l_int, false, t)) { - device->mul_mat_l_int[i] = false; - } - - if (!ggml_vk_matmul_int_shmem_support(device, s_intid, true, t)) { - device->mul_mat_id_s_int[i] = false; - device->mul_mat_id_m_int[i] = false; - device->mul_mat_id_l_int[i] = false; - } else if (!ggml_vk_matmul_int_shmem_support(device, m_intid, true, t)) { - device->mul_mat_id_m_int[i] = false; - device->mul_mat_id_l_int[i] = false; - } else if (!ggml_vk_matmul_int_shmem_support(device, l_intid, true, t)) { - device->mul_mat_id_l_int[i] = false; - } - } - } - - if (!device->pipeline_matmul_f32) { - device->pipeline_matmul_f32 = std::make_shared(); - } - if (!device->pipeline_matmul_f32_f16) { - device->pipeline_matmul_f32_f16 = std::make_shared(); - } - if (!device->pipeline_matmul_id_f32) { - device->pipeline_matmul_id_f32 = std::make_shared(); - } - if (!device->pipeline_matmul_bf16) { - device->pipeline_matmul_bf16 = std::make_shared(); - } - if (!device->pipeline_matmul_id_bf16) { - device->pipeline_matmul_id_bf16 = std::make_shared(); - } - - auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint, - uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, - uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { - - if (!require_full_subgroups && required_subgroup_size == 0) { - required_subgroup_size = get_subgroup_size(name, device->architecture); - } - - vk_pipeline *ptr = &base_pipeline; - - int num_pipelines = 1; -#if defined(VK_EXT_shader_64bit_indexing) - if (device->shader_64b_indexing) { - num_pipelines = 2; - } -#endif - for (int i = 0; i < num_pipelines; ++i, ptr = &(*ptr)->next) { - vk_pipeline &pipeline = *ptr; - if (!pipeline) { - pipeline = std::make_shared(); - } - if (!pipeline->initialized) { - pipeline->name = name; - pipeline->parameter_count = parameter_count; - pipeline->push_constant_size = push_constant_size; - pipeline->wg_denoms = wg_denoms; - pipeline->align = align; - pipeline->initialized = true; -#if defined(VK_EXT_shader_64bit_indexing) - pipeline->is_64b_indexing = (i == 1); -#endif - } - - // We only care about the pipeline this call asked for; the rest - // (including the 64-bit indexing variant) are handled by their - // own request_descriptor_sets / load_shaders calls. - if (pipeline.get() != requested.get()) { - continue; - } - - if (pipeline->compiled) { - continue; - } - - wait_pipeline = pipeline; - - if (!pipeline->compile_pending) { - pipeline->compile_pending = true; - claimed_task.pipeline = pipeline; - claimed_task.spv_size = spv_size; - claimed_task.spv_data = spv_data; - claimed_task.entrypoint = entrypoint; - claimed_task.parameter_count = parameter_count; - claimed_task.wg_denoms = wg_denoms; - claimed_task.specialization_constants = specialization_constants; - claimed_task.disable_robustness = disable_robustness; - claimed_task.require_full_subgroups = require_full_subgroups; - claimed_task.required_subgroup_size = required_subgroup_size; - has_claimed_task = true; - } - } - }; - - auto const &ggml_vk_create_pipeline2 = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const char *entrypoint, - uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, - uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { - return ggml_vk_create_pipeline(device, pipeline, name.c_str(), spv_size, spv_data, entrypoint, - parameter_count, push_constant_size, wg_denoms, specialization_constants, - align, disable_robustness, require_full_subgroups, required_subgroup_size); - }; - - // FA scalar has two SPIR-V modules (MMQ vs non-MMQ); FA cm1 has one. K/V - // quant type is selected at runtime via the FaTypeK / FaTypeV spec constants. - - for (auto &fa : device->pipeline_flash_attn_f32_f16) { - if (fa.first.path != FA_SCALAR) continue; - const uint32_t Br = fa.first.Br; - const uint32_t Bc = fa.first.Bc; - const bool aligned = fa.first.aligned; - const bool f32acc = fa.first.f32acc; - const uint32_t fa_sgs = fa.first.subgroup_size; - const bool fa_ds = fa.first.subgroup_size == 0; - - const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; - const bool use_mmq = ggml_vk_fa_scalar_uses_mmq(device, fa.first.k_type); - const void * spv_data = nullptr; - size_t spv_size = 0; - const char *name = nullptr; - if (bf16_kv) { - spv_data = flash_attn_f32_f16_fp32_data; - spv_size = flash_attn_f32_f16_fp32_len; - name = aligned ? "flash_attn_f32_bf16_aligned" : "flash_attn_f32_bf16"; - } else if (use_mmq) { -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->fp16) { - if (f32acc) { spv_data = flash_attn_f32_f16_int8_data; spv_size = flash_attn_f32_f16_int8_len; } - else { spv_data = flash_attn_f32_f16_f16acc_int8_data; spv_size = flash_attn_f32_f16_f16acc_int8_len; } - } else { - spv_data = flash_attn_f32_f16_fp32_int8_data; - spv_size = flash_attn_f32_f16_fp32_int8_len; - } -#endif - name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; - } else { - if (device->fp16) { - if (device->dot2_f16) { - if (f32acc) { spv_data = flash_attn_f32_f16_dot2_data; spv_size = flash_attn_f32_f16_dot2_len; } - else { spv_data = flash_attn_f32_f16_dot2_f16acc_data; spv_size = flash_attn_f32_f16_dot2_f16acc_len; } - } else { - if (f32acc) { spv_data = flash_attn_f32_f16_data; spv_size = flash_attn_f32_f16_len; } - else { spv_data = flash_attn_f32_f16_f16acc_data; spv_size = flash_attn_f32_f16_f16acc_len; } - } - } else { - spv_data = flash_attn_f32_f16_fp32_data; - spv_size = flash_attn_f32_f16_fp32_len; - } - name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; - } - ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, - sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, - get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, - !fa_ds, !fa_ds ? fa_sgs : 0); - } - -#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - if (device->coopmat1_fa_support) { - for (auto &fa : device->pipeline_flash_attn_f32_f16) { - if (fa.first.path != FA_COOPMAT1) continue; - const uint32_t Br = fa.first.Br; - const uint32_t Bc = fa.first.Bc; - const bool aligned = fa.first.aligned; - const bool f32acc = fa.first.f32acc; - const uint32_t fa_sgs = fa.first.subgroup_size; - const bool fa_ds = fa.first.subgroup_size == 0; - - const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; - - const void * spv_data; - size_t spv_size; - const char *name; - if (bf16_kv) { -#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (!device->coopmat_bf16_support) continue; - spv_data = flash_attn_f32_f16_bf16_cm1_data; - spv_size = flash_attn_f32_f16_bf16_cm1_len; - name = aligned ? "flash_attn_f32_bf16_aligned_cm1" : "flash_attn_f32_bf16_cm1"; -#else - continue; -#endif - } else { - if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data; spv_size = flash_attn_f32_f16_cm1_len; } - else { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; } - name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1"; - } - ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, - sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, - get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, - !fa_ds, !fa_ds ? fa_sgs : 0); - } - } -#endif - -#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - if (device->coopmat2) { - for (auto &fa : device->pipeline_flash_attn_f32_f16) { - if (fa.first.path != FA_COOPMAT2) continue; - const uint32_t Br = fa.first.Br; - const uint32_t Bc = fa.first.Bc; - const bool aligned = fa.first.aligned; - const bool f32acc = fa.first.f32acc; - - const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; - const void * spv_data; - size_t spv_size; - const char * name; - if (bf16_kv) { -#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (!device->coopmat2_bf16_support) continue; - spv_data = flash_attn_f32_f16_bf16_cm2_data; - spv_size = flash_attn_f32_f16_bf16_cm2_len; - name = aligned ? "flash_attn_f32_bf16_aligned_cm2" : "flash_attn_f32_bf16_cm2"; -#else - continue; -#endif - } else if (aligned) { - if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data; spv_size = flash_attn_f32_f16_cm2_len; name = "flash_attn_f32_f16_aligned_f32acc_cm2"; } - else { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_aligned_f16acc_cm2"; } - } else { - if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data; spv_size = flash_attn_f32_f16_cm2_len; name = "flash_attn_f32_f16_f32acc_cm2"; } - else { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_f16acc_cm2"; } - } - ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, - sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, - get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, false, 0); - } - } -#endif - - const int mul_mat_id_param_count = 5; - -#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - if (device->coopmat2) { - - // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, true); \ - - // Create 2 variants, {f16,f32} accumulator -#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - - CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (device->coopmat_bf16_support) { - CREATE_MM(pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) - } -#endif - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q1_0], matmul_q1_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0], matmul_q4_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1], matmul_q4_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0], matmul_q5_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1], matmul_q5_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0], matmul_q8_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K], matmul_q2_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K], matmul_q3_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K], matmul_q4_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K], matmul_q5_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K], matmul_q6_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S], matmul_iq1_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M], matmul_iq1_m_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S], matmul_iq2_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S], matmul_iq3_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_MXFP4], matmul_mxfp4_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_NVFP4], matmul_nvfp4_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) - - GGML_ASSERT(device->subgroup_ballot); - - CREATE_MM2(pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 5) -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (device->coopmat_bf16_support) { - CREATE_MM(pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 5) - } -#endif - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) - CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) -#undef CREATE_MM -#undef CREATE_MM2 - } else -#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) -#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - if (device->coopmat_support) { - // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \ - - // Create 2 variants, {f16,f32} accumulator -#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ - if (device->coopmat_acc_f16_support) { \ - CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ - } \ - if (device->coopmat_acc_f32_support) { \ - CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ - } \ - - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (device->coopmat_bf16_support) { - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ) - } -#endif - - if (device->coopmat_acc_f16_support) { - CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - - CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S], matmul_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M], matmul_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S], matmul_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S], matmul_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4], matmul_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - } else { - CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc, matmul_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc, matmul_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4].f32acc, matmul_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4].f32acc, matmul_nvfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - } - - GGML_ASSERT(device->subgroup_ballot); - - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (device->coopmat_bf16_support) { - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - } -#endif - - CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); - CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); -#undef CREATE_MM2 -#undef CREATE_MM - } else -#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - if (device->fp16) { - // Create 6 variants, {s,m,l}x{unaligned,aligned} - // Selects dot2 SPIR-V variant at runtime when device->dot2_f16 is true -#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - - // bf16 scalar path promotes to f32, no dot2 variant -#define CREATE_MM_NODOT2(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - -#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - if (device->mul_mat ## ID ## _l_int[TYPE]) { \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->l, #NAMELC "_l", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - } \ - if (device->mul_mat ## ID ## _m_int[TYPE]) { \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->m, #NAMELC "_m", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - } \ - if (device->mul_mat ## ID ## _s_int[TYPE]) { \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->s, #NAMELC "_s", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - } \ - - // Create 2 variants, {f16,f32} accumulator -#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S], matmul_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M], matmul_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S], matmul_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S], matmul_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4], matmul_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->integer_dot_product) { - CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0], matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1], matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0], matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1], matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0], matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_MXFP4], matmul_mxfp4_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q2_K], matmul_q2_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q3_K], matmul_q3_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_K], matmul_q4_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_K], matmul_q5_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); - CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q6_K], matmul_q6_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); - } -#endif - - if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) { - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->integer_dot_product) { - CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - - CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - - CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - } -#endif - } else { - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_iq1_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_iq1_m_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_iq2_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_iq2_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_iq2_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_iq3_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_iq3_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_iq4_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_mxfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_nvfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->integer_dot_product) { - CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_0], matmul_id_q4_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_1], matmul_id_q4_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_0], matmul_id_q5_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_1], matmul_id_q5_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q8_0], matmul_id_q8_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - - CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_MXFP4], matmul_id_mxfp4_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - - CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q2_K], matmul_id_q2_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q3_K], matmul_id_q3_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_K], matmul_id_q4_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_K], matmul_id_q5_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q6_K], matmul_id_q6_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - } -#endif - } -#undef CREATE_MM2 -#undef CREATE_MMQ -#undef CREATE_MM -#undef CREATE_MM_NODOT2 - } else { - // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ - -#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ - if (device->mul_mat ## ID ## _l_int[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC "_l", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ - if (device->mul_mat ## ID ## _m_int[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC "_m", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ - if (device->mul_mat ## ID ## _s_int[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC "_s", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ - - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc, matmul_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc, matmul_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4].f32acc, matmul_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4].f32acc, matmul_nvfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); - -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->integer_dot_product) { - CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); - - CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); - CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); - } -#endif - - if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) { - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); - - CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_subgroup_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_subgroup_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_subgroup_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_subgroup_q5_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_subgroup_q8_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_subgroup_q2_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_subgroup_q3_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_subgroup_q4_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_subgroup_q5_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_subgroup_q6_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f32acc, matmul_id_subgroup_iq1_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f32acc, matmul_id_subgroup_iq1_m_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_subgroup_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_subgroup_iq2_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_subgroup_iq2_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_subgroup_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_subgroup_iq3_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_subgroup_iq4_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4].f32acc, matmul_id_subgroup_nvfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); - } else { - CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - - CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f32acc, matmul_id_iq1_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f32acc, matmul_id_iq1_m_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4].f32acc, matmul_id_nvfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - } - } - // reusing CREATE_MM from the fp32 path - if ((device->coopmat2 || device->coopmat_support) -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - && !device->coopmat_bf16_support -#endif - ) { - const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; - - // use scalar tile sizes - l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; - m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, 4, 2, 1, subgroup_size_8 }; - s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, 2, 2, 1, subgroup_size_8 }; - - l_wg_denoms = {128, 128, 1 }; - m_wg_denoms = { 64, 64, 1 }; - s_wg_denoms = { 32, 32, 1 }; - - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); - CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); - } -#undef CREATE_MM - - // mul mat vec - - // the number of rows computed per shader depends on GPU model and quant - uint32_t rm_stdq = 1; - uint32_t rm_kq = 2; - uint32_t rm_stdq_int = 1; - uint32_t rm_kq_int = 1; - auto const &rm_iq_int = [](uint32_t i) { return i == 0 ? 8u : 4u; }; - if (device->vendor_id == VK_VENDOR_ID_AMD) { - if (device->architecture == AMD_GCN) { - rm_stdq = 2; - rm_kq = 4; - rm_stdq_int = 4; - } - } else if (device->vendor_id == VK_VENDOR_ID_INTEL) { - rm_stdq = 2; - rm_stdq_int = 2; - } - uint32_t rm_iq = 2 * rm_kq; - - const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN; - // Ensure a subgroup size >= 16 is available - const bool use_subgroups16 = use_subgroups && subgroup_min_size_16; - - const uint32_t subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16) ? 16 : device->subgroup_size; - const uint32_t subgroup_size16 = std::max(subgroup_size, 16u); - - const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0; - const uint32_t force_subgroup_size16 = use_subgroups16 ? subgroup_size16 : 0; - static constexpr uint32_t mul_mat_vec_num_bindings = 5; - static constexpr uint32_t mul_mat_vec_id_num_bindings = 6; - - for (uint32_t w = 0; w < DMMV_WG_SIZE_COUNT; ++w) { - const uint32_t wg_size_subgroup = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size : (subgroup_size * 4); - const uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size16 : (subgroup_size16 * 4); - - const shader_reduction_mode reduc = (use_subgroups && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP : - (use_subgroups && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID : - SHADER_REDUCTION_MODE_SHMEM; - - const shader_reduction_mode reduc16 = (use_subgroups16 && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP : - (use_subgroups16 && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID : - SHADER_REDUCTION_MODE_SHMEM; - - for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f32_f32", arr_dmmv_q1_0_f32_f32_len[reduc], arr_dmmv_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[reduc], arr_dmmv_q5_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[reduc], arr_dmmv_q8_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[reduc16], arr_dmmv_q2_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[reduc16], arr_dmmv_q3_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[reduc16], arr_dmmv_q4_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[reduc16], arr_dmmv_q5_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[reduc16], arr_dmmv_q6_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[reduc16], arr_dmmv_iq1_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[reduc16], arr_dmmv_iq1_m_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_iq2_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[reduc16], arr_dmmv_iq2_xs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[reduc16], arr_dmmv_iq2_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_iq3_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[reduc16], arr_dmmv_iq3_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[reduc16], arr_dmmv_iq4_xs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f32_f32", arr_dmmv_nvfp4_f32_f32_len[reduc16], arr_dmmv_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f16_f32", arr_dmmv_q1_0_f16_f32_len[reduc], arr_dmmv_q1_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[reduc], arr_dmmv_q5_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[reduc], arr_dmmv_q8_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[reduc16], arr_dmmv_q2_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[reduc16], arr_dmmv_q3_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[reduc16], arr_dmmv_q4_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[reduc16], arr_dmmv_q5_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[reduc16], arr_dmmv_q6_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[reduc16], arr_dmmv_iq1_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[reduc16], arr_dmmv_iq1_m_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[reduc16], arr_dmmv_iq2_xxs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[reduc16], arr_dmmv_iq2_xs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[reduc16], arr_dmmv_iq2_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[reduc16], arr_dmmv_iq3_xxs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[reduc16], arr_dmmv_iq3_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[reduc16], arr_dmmv_iq4_xs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f16_f32", arr_dmmv_nvfp4_f16_f32_len[reduc16], arr_dmmv_nvfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->integer_dot_product) { - const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; - const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_q8_1_f32", arr_dmmv_q5_1_q8_1_f32_len[reduc], arr_dmmv_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32", arr_dmmv_q8_0_q8_1_f32_len[reduc], arr_dmmv_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_q8_1_f32", arr_dmmv_mxfp4_q8_1_f32_len[reduc], arr_dmmv_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_q8_1_f32", arr_dmmv_q2_k_q8_1_f32_len[reduc], arr_dmmv_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_q8_1_f32", arr_dmmv_q3_k_q8_1_f32_len[reduc], arr_dmmv_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_q8_1_f32", arr_dmmv_q4_k_q8_1_f32_len[reduc], arr_dmmv_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_q8_1_f32", arr_dmmv_q5_k_q8_1_f32_len[reduc], arr_dmmv_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_q8_1_f32", arr_dmmv_q6_k_q8_1_f32_len[reduc], arr_dmmv_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_q8_1_f32", arr_dmmv_iq1_s_q8_1_f32_len[reduc], arr_dmmv_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_q8_1_f32", arr_dmmv_iq1_m_q8_1_f32_len[reduc], arr_dmmv_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int); - - } -#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT - } - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {wg_size_subgroup, 1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", arr_dmmv_id_f16_f32_f32_len[reduc], arr_dmmv_id_f16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", arr_dmmv_id_bf16_f32_f32_len[reduc], arr_dmmv_id_bf16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q1_0], "mul_mat_vec_id_q1_0_f32", arr_dmmv_id_q1_0_f32_f32_len[reduc], arr_dmmv_id_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", arr_dmmv_id_q4_0_f32_f32_len[reduc], arr_dmmv_id_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", arr_dmmv_id_q4_1_f32_f32_len[reduc], arr_dmmv_id_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", arr_dmmv_id_q5_0_f32_f32_len[reduc], arr_dmmv_id_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", arr_dmmv_id_q5_1_f32_f32_len[reduc], arr_dmmv_id_q5_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", arr_dmmv_id_q8_0_f32_f32_len[reduc], arr_dmmv_id_q8_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", arr_dmmv_id_q2_k_f32_f32_len[reduc16], arr_dmmv_id_q2_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", arr_dmmv_id_q3_k_f32_f32_len[reduc16], arr_dmmv_id_q3_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", arr_dmmv_id_q4_k_f32_f32_len[reduc16], arr_dmmv_id_q4_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", arr_dmmv_id_q5_k_f32_f32_len[reduc16], arr_dmmv_id_q5_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", arr_dmmv_id_q6_k_f32_f32_len[reduc16], arr_dmmv_id_q6_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", arr_dmmv_id_iq1_s_f32_f32_len[reduc16], arr_dmmv_id_iq1_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", arr_dmmv_id_iq1_m_f32_f32_len[reduc16], arr_dmmv_id_iq1_m_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", arr_dmmv_id_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_id_iq2_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", arr_dmmv_id_iq2_xs_f32_f32_len[reduc16], arr_dmmv_id_iq2_xs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", arr_dmmv_id_iq2_s_f32_f32_len[reduc16], arr_dmmv_id_iq2_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", arr_dmmv_id_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_id_iq3_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", arr_dmmv_id_iq3_s_f32_f32_len[reduc16], arr_dmmv_id_iq3_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", arr_dmmv_id_iq4_xs_f32_f32_len[reduc16], arr_dmmv_id_iq4_xs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", arr_dmmv_id_iq4_nl_f32_f32_len[reduc16], arr_dmmv_id_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_f32", arr_dmmv_id_mxfp4_f32_f32_len[reduc16], arr_dmmv_id_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_NVFP4], "mul_mat_vec_id_nvfp4_f32", arr_dmmv_id_nvfp4_f32_f32_len[reduc16], arr_dmmv_id_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); - -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - if (device->integer_dot_product) { - const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; - const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int); - } -#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT - } - -#if !defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - GGML_UNUSED(rm_stdq_int); - GGML_UNUSED(rm_kq_int); - GGML_UNUSED(rm_iq_int); -#endif - - // dequant shaders - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q1_0], "dequant_q1_0", dequant_q1_0_len, dequant_q1_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 8, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ1_S], "dequant_iq1_s", dequant_iq1_s_len, dequant_iq1_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ1_M], "dequant_iq1_m", dequant_iq1_m_len, dequant_iq1_m_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XXS], "dequant_iq2_xxs", dequant_iq2_xxs_len, dequant_iq2_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XS], "dequant_iq2_xs", dequant_iq2_xs_len, dequant_iq2_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_XS], "dequant_iq4_xs", dequant_iq4_xs_len, dequant_iq4_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_MXFP4], "dequant_mxfp4", dequant_mxfp4_len, dequant_mxfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_NVFP4], "dequant_nvfp4", dequant_nvfp4_len, dequant_nvfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - - // get_rows - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_BF16], "get_rows_bf16", get_rows_bf16_len, get_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q1_0], "get_rows_q1_0", get_rows_q1_0_len, get_rows_q1_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q2_K], "get_rows_q2_k", get_rows_q2_k_len, get_rows_q2_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q3_K], "get_rows_q3_k", get_rows_q3_k_len, get_rows_q3_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_K], "get_rows_q4_k", get_rows_q4_k_len, get_rows_q4_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_K], "get_rows_q5_k", get_rows_q5_k_len, get_rows_q5_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q6_K], "get_rows_q6_k", get_rows_q6_k_len, get_rows_q6_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ1_S], "get_rows_iq1_s", get_rows_iq1_s_len, get_rows_iq1_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ1_M], "get_rows_iq1_m", get_rows_iq1_m_len, get_rows_iq1_m_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs", get_rows_iq2_xxs_len, get_rows_iq2_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs", get_rows_iq2_xs_len, get_rows_iq2_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_NVFP4], "get_rows_nvfp4", get_rows_nvfp4_len, get_rows_nvfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_BF16], "get_rows_bf16_f32", get_rows_bf16_f32_len, get_rows_bf16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q1_0], "get_rows_q1_0_f32", get_rows_q1_0_f32_len, get_rows_q1_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q2_K], "get_rows_q2_k_f32", get_rows_q2_k_f32_len, get_rows_q2_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q3_K], "get_rows_q3_k_f32", get_rows_q3_k_f32_len, get_rows_q3_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_K], "get_rows_q4_k_f32", get_rows_q4_k_f32_len, get_rows_q4_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_K], "get_rows_q5_k_f32", get_rows_q5_k_f32_len, get_rows_q5_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q6_K], "get_rows_q6_k_f32", get_rows_q6_k_f32_len, get_rows_q6_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ1_S], "get_rows_iq1_s_f32", get_rows_iq1_s_f32_len, get_rows_iq1_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ1_M], "get_rows_iq1_m_f32", get_rows_iq1_m_f32_len, get_rows_iq1_m_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs_f32", get_rows_iq2_xxs_f32_len, get_rows_iq2_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs_f32", get_rows_iq2_xs_f32_len, get_rows_iq2_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs_f32", get_rows_iq4_xs_f32_len, get_rows_iq4_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); - - for (auto &it : device->pipeline_fa_mask_opt) { - auto BrBc = it.first; - ggml_vk_create_pipeline(device, it.second, "fa_mask_opt", fa_mask_opt_len, fa_mask_opt_data, "main", 2, sizeof(vk_op_flash_attn_mask_opt_push_constants), {1, 1, 1}, {128, 128 / device->subgroup_size, BrBc.first, BrBc.second}, 1, true, true, device->subgroup_size); - } - - if (device->subgroup_clustered && device->subgroup_require_full_support) { - ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true); - } else { - ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1); - } - - for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { - if (device->subgroup_arithmetic && device->subgroup_require_full_support) { - ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true); - } else { - ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true); - } - } - ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_nc_push_constants), {1, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_partials_f32, "rms_norm_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_partials_f32, "rms_norm_mul_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); - - if (sizeof(vk_op_rms_norm_mul_rope_push_constants) <= device->properties.limits.maxPushConstantsSize) { - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f32, "rms_norm_mul_rope_f32_f32", rms_norm_mul_rope_f32_f32_len, rms_norm_mul_rope_f32_f32_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f16, "rms_norm_mul_rope_f32_f16", rms_norm_mul_rope_f32_f16_len, rms_norm_mul_rope_f32_f16_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); - } - - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_bf16_f32,"cpy_bf16_f32",cpy_bf16_f32_len,cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_i32_f32, "cpy_i32_f32", cpy_i32_f32_len, cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_i32, "cpy_f32_i32", cpy_f32_i32_len, cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_bf16_f32,"contig_cpy_bf16_f32",contig_cpy_bf16_f32_len,contig_cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); - -#define SET_ROWS(itype) \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## _len, set_rows_f32 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); - - SET_ROWS(_i32) - SET_ROWS(_i64) -#undef SET_ROWS - - - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q1_0], "cpy_q1_0_f32", cpy_q1_0_f32_len, cpy_q1_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q1_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); - - auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) { - std::string s; - s += std::string(src0_f16 ? "_f16" : "_f32"); - s += std::string(src1_f16 ? "_f16" : "_f32"); - s += std::string(dst_f16 ? "_f16" : "_f32"); - return s; - }; - -#define CREATE_BINARY(name, namemod, spec, bindings) \ - for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \ - ggml_vk_create_pipeline2(device, device->pipeline_ ## name ## namemod[s0][s1][d], \ - #name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \ - "main", (bindings), sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1); - - CREATE_BINARY(add, , {0}, 4) - CREATE_BINARY(add, _norepeat, {1}, 4) - CREATE_BINARY(sub, , {0}, 3) - CREATE_BINARY(sub, _norepeat, {1}, 3) - CREATE_BINARY(mul, , {0}, 3) - CREATE_BINARY(mul, _norepeat, {1}, 3) - CREATE_BINARY(div, , {0}, 3) - CREATE_BINARY(div, _norepeat, {1}, 3) - CREATE_BINARY(add_rms, , {0}, 4) - CREATE_BINARY(add_rms, _norepeat, {1}, 4) -#undef CREATE_BINARY - - if (device->multi_add) { - for (uint32_t i = 0; i < MAX_FUSED_ADDS; ++i) { - ggml_vk_create_pipeline2(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); - ggml_vk_create_pipeline2(device, device->pipeline_multi_add_rms[i], "multi_add_rms_f32_" + std::to_string(i+1), multi_add_rms_f32_len, multi_add_rms_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); - } - } - - ggml_vk_create_pipeline(device, device->pipeline_add_id_f32, "add_id_f32", add_id_f32_len, add_id_f32_data, "main", 4, sizeof(vk_op_add_id_push_constants), {1, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_concat_i8, "concat_i8", concat_i8_len, concat_i8_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_concat_i16, "concat_i16", concat_i16_len, concat_i16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_concat_i64, "concat_i64", concat_i64_len, concat_i64_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_bicubic_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BICUBIC}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_antialias_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_sqrt_f32, "sqrt_f32", sqrt_f32_len, sqrt_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_repeat_i32, "repeat_i32", repeat_i32_len, repeat_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - -#define CREATE_UNARY(name) \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - - CREATE_UNARY(elu) - CREATE_UNARY(gelu) - CREATE_UNARY(gelu_erf) - CREATE_UNARY(gelu_quick) - CREATE_UNARY(silu) - CREATE_UNARY(relu) - CREATE_UNARY(xielu) - CREATE_UNARY(neg) - CREATE_UNARY(tanh) - CREATE_UNARY(sigmoid) - CREATE_UNARY(hardsigmoid) - CREATE_UNARY(hardswish) - CREATE_UNARY(abs) - CREATE_UNARY(softplus) - CREATE_UNARY(step) - CREATE_UNARY(round) - CREATE_UNARY(ceil) - CREATE_UNARY(floor) - CREATE_UNARY(trunc) - CREATE_UNARY(sgn) - CREATE_UNARY(exp) - CREATE_UNARY(expm1) -#undef CREATE_UNARY - - ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - -#define CREATE_GLU(name) \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); - - CREATE_GLU(geglu) - CREATE_GLU(reglu) - CREATE_GLU(swiglu) - CREATE_GLU(swiglu_oai) - CREATE_GLU(geglu_erf) - CREATE_GLU(geglu_quick) -#undef CREATE_GLU - - ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true); - - ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); - - ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); - - ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f32, "rope_vision_f32", rope_vision_f32_len, rope_vision_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - - for (uint32_t i = 0; i < num_argsort_pipelines; ++i) { - uint32_t BLOCK_SIZE = 1u << std::min(i, device->max_workgroup_size_log2); - if (i <= device->max_workgroup_size_log2 && - 2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) { - const uint32_t NCOLS_PADDED_LOG2 = i; - ggml_vk_create_pipeline2(device, device->pipeline_argsort_f32[i], "argsort_f32_"+std::to_string(i), argsort_f32_len, argsort_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true); - } - const uint32_t WG_UNROLL_FACTOR = BLOCK_SIZE > 1 ? 2 : 1; - BLOCK_SIZE /= WG_UNROLL_FACTOR; - ggml_vk_create_pipeline2(device, device->pipeline_argsort_large_f32[i], "argsort_large_f32_"+std::to_string(i), argsort_large_f32_len, argsort_large_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE * WG_UNROLL_FACTOR, 1, 1}, {BLOCK_SIZE, WG_UNROLL_FACTOR}, 1, true); - } - - for (uint32_t i = 0; i < num_topk_pipelines; ++i) { - const uint32_t BLOCK_SIZE = 1u << i; - const uint32_t NCOLS_PADDED_LOG2 = i; - if (i <= device->max_workgroup_size_log2) { - uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE + - sizeof(int) * device->subgroup_size + - 2 * sizeof(int) + - 2 * (BLOCK_SIZE / device->subgroup_size) * sizeof(int); - if (device->subgroup_arithmetic && device->subgroup_require_full_support && device->subgroup_shuffle && device->subgroup_ballot && - nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) { - ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size); - } else if (2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) { - ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_argsort_f32_len, topk_argsort_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true); - } - } - } - - ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - - ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - // Intel Arc B390 was observed segfaulting with this shader. - if (device->subgroup_basic && device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL) { - int idx = 0; - for (uint32_t n : {64, 128, 256, 512}) { - if (device->subgroup_size <= n) { - ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_f32", fwht_f32_len, fwht_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { device->subgroup_size, n }, 1, true, true, device->subgroup_size); - } - ++idx; - } - } else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) { - // Disabled on Intel Windows due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147 - int idx = 0; - for (uint32_t n : {64, 128, 256, 512}) { - const uint32_t block_size = std::min(device->subgroup_size, n); - ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1); - ++idx; - } - } - - const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4; - ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size); - - ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1); - - ggml_vk_create_pipeline(device, device->pipeline_count_experts, "count_experts", count_experts_len, count_experts_data, "main", 2, sizeof(vk_op_count_experts_push_constants), {1, 1, 1}, {}, 1, true); - - for (auto &s : device->pipeline_solve_tri_f32) { - const vk_solve_tri_pipeline_state &state = s.first; - - // Max number of rows to load at a time, limited by shared memory - const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((state.N + state.K) * sizeof(float)); - // Need at least K invocations, and prefer a minimum of 128 to spread out loading shared memory - const uint32_t block_size = std::max(128u, 1u << (uint32_t)ceilf(log2f(float(state.K)))); - - ggml_vk_create_pipeline( - device, s.second, "solve_tri_f32", - solve_tri_f32_len, solve_tri_f32_data, "main", 3, - sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K, batch_N, block_size }, 1, true); - } - -#define IM2COL(bda) \ - ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32 ## bda ## _len, im2col_3d_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); - if (device->shader_int64 && device->buffer_device_address) { - IM2COL(_bda) - } else { - IM2COL() - } - - ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32, "col2im_1d_f32", col2im_1d_f32_len, col2im_1d_f32_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16, "col2im_1d_f16", col2im_1d_f16_len, col2im_1d_f16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); - - ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - - { - const uint32_t gdn_sizes[] = {16, 32, 64, 128}; - const char * gdn_names[][2] = { - {"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"}, - {"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"}, - {"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"}, - {"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"}, - }; - for (uint32_t si = 0; si < 4; si++) { - const uint32_t S_V = gdn_sizes[si]; - GGML_ASSERT(is_pow2(S_V)); - - uint32_t lanes_per_column; - if (S_V >= 128u && device->subgroup_clustered) { - lanes_per_column = 8u; - } else { - // Use largest power-of-two that divides both S_V and subgroup_size so that - // (1) S_V % lanes_per_column == 0 and (2) S_V % (subgroup_size / lanes_per_column) == 0. - // This means we don't need extra bounds checking logic in the shader. - lanes_per_column = std::min(S_V, device->subgroup_size); - } - - // gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and - // S_V % LANES_PER_COLUMN == 0 to avoid bounds checks. - while (lanes_per_column > 1u) { - const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 && - (S_V % lanes_per_column) == 0; - const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0; - if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) { - break; - } - lanes_per_column >>= 1u; - } - - GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0); - GGML_ASSERT((S_V % lanes_per_column) == 0); - GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0); - - const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size; - const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce; - const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce; - const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce; - size_t gdn_len; - const void * gdn_data; - if (use_clustered_reduce) { - gdn_len = gated_delta_net_f32_len; - gdn_data = (const void *)gated_delta_net_f32_data; - } else if (use_subgroup_reduce) { - gdn_len = gated_delta_net_f32_nocluster_len; - gdn_data = (const void *)gated_delta_net_f32_nocluster_data; - } else { - gdn_len = gated_delta_net_f32_shmem_len; - gdn_data = (const void *)gated_delta_net_f32_shmem_data; - } - - const uint32_t cols_per_wg = device->subgroup_size / lanes_per_column; - const std::array wg_denoms = {1u, 1u, cols_per_wg}; - - for (uint32_t kda = 0; kda < 2; kda++) { - ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda], - gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants), - wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size); - } - } - } - - if (device->subgroup_arithmetic && device->subgroup_require_full_support) { - ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true); - ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true); - } else { - ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true); - ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true); - } - - ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 0}, 1); - ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_silu_f32, "ssm_conv_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_bias_silu_f32, "ssm_conv_bias_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 1, 1}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - - ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - - // conv2d, conv_transpose_2d - for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) { - // smaller WG for the small-tile fallback gives more concurrent WGs per SM - uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256; - uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices. - uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8; - uint32_t conv2d_SHMEM_PAD = 4; - vk_conv_block_size conv2d_BS = vk_conv_block_sizes[s]; - bool conv2d_UNROLL = true; - -#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - if (device->coopmat2) { - conv2d_SHMEM_PAD = 8; // 8 float16_t - } -#endif - - if (device->vendor_id == VK_VENDOR_ID_INTEL) { - conv2d_SHMEM_PAD = 0; - conv2d_UNROLL = false; - } else if (device->vendor_id == VK_VENDOR_ID_AMD) { - conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4; - if (s == CONV_SHAPE_128x128 && device->architecture != vk_device_architecture::AMD_GCN) { - conv2d_UNROLL = false; - } - } - - // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math. - bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA || - device->architecture == vk_device_architecture::NVIDIA_PRE_TURING; - bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD || - device->architecture == vk_device_architecture::AMD_GCN; - - if (device->subgroup_shuffle && - device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316. - allow_collectives_nv && - allow_collectives_amd) { - use_collectives = 1; - conv2d_BS.CRS = std::min( - device->subgroup_size, - conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used. - } - - // cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size). - // Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader. - // Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need - // subgroup_size_control to force the driver to actually use it. - bool conv2d_use_cm1 = false; -#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - conv2d_use_cm1 = !device->coopmat2 && - device->coopmat_support && device->coopmat_support_16x16x16_f16acc && - device->subgroup_size_control && - (device->subgroup_size == 32 || device->subgroup_size == 64) && - s != CONV_SHAPE_128x128; -#endif - - const uint32_t conv2d_cm1_shmem_pad = 8; - - auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) { - const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float); - const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u; - return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size; - }; - - // coopmat1 needs to store the output through shared memory, so check up front - // whether it'll fit and disable it before applying coopmat1 parameters. - if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) { - conv2d_use_cm1 = false; - } - - uint32_t conv2d_WM = 16, conv2d_WN = 16; // cm1 subgroup tile, ignored otherwise - if (conv2d_use_cm1) { - conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad; - // 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256 - // (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64). - const bool sg64 = (device->subgroup_size == 64); - switch (s) { - case CONV_SHAPE_64x32: conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break; - case CONV_SHAPE_64x128: conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break; - case CONV_SHAPE_32x256: conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break; - default: break; - } - const uint32_t warps_M = conv2d_BS.K / conv2d_WM; - const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN; - conv2d_WG_SIZE = warps_M * warps_N * device->subgroup_size; - } - - // stage cm2 accumulator through shmem for coalesced global stores; - // skipped on 128x128 where the extra Csh footprint hurts occupancy. - // cm1 always uses the staged path. - uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u; - if (conv2d_use_cm1) { - conv2d_csh_store = 1; - } - - // shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar - const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1; - - // shrink CRS if the non-cm1 config still doesn't fit - if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) { - GGML_ASSERT(!conv2d_use_cm1); - conv2d_BS.CRS = 8; - if (use_collectives) { - conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS); - } - conv2d_csh_store = 0; - } - - std::array wg_denoms = { conv2d_BS.K, 1, 1 }; - std::vector spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD }; - - // cm1 needs a fixed subgroup width to match the WG_SIZE we computed - const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0; - -#define CREATE_CONV(name, type_suffix, spv_suffix) \ - for (auto &c : device->pipeline_##name##type_suffix[s]) { \ - const vk_conv2d_pipeline_state &state = c.first; \ - std::vector spec_constants_cpy = spec_constants; \ - spec_constants_cpy.push_back(state.s0); \ - spec_constants_cpy.push_back(state.s1); \ - spec_constants_cpy.push_back(state.p0); \ - spec_constants_cpy.push_back(state.p1); \ - spec_constants_cpy.push_back(state.d0); \ - spec_constants_cpy.push_back(state.d1); \ - spec_constants_cpy.push_back(state.KW); \ - spec_constants_cpy.push_back(state.KH); \ - spec_constants_cpy.push_back(state.aligned); \ - spec_constants_cpy.push_back(conv2d_csh_store); \ - spec_constants_cpy.push_back(conv2d_WM); \ - spec_constants_cpy.push_back(conv2d_WN); \ - ggml_vk_create_pipeline( \ - device, c.second, #name #type_suffix, \ - name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \ - sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size); \ - } -#define CREATE_CONVS(spv_suffix) \ - CREATE_CONV(conv2d, _f32, spv_suffix) \ - CREATE_CONV(conv2d, _f16_f32, spv_suffix) \ - CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \ - CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix) -#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - if (device->coopmat2) { - CREATE_CONVS(_cm2) - } else -#endif -#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - if (conv2d_use_cm1) { - CREATE_CONVS(_cm1) - } else -#endif - if (conv2d_UNROLL) { - CREATE_CONVS(_unroll) - } else { - CREATE_CONVS( ) - } -#undef CREATE_CONV -#undef CREATE_CONVS - } - - ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - - for (uint32_t use_push = 0; use_push < 2; ++use_push) { - for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - } - } - - // Drop compile_mutex so other threads can walk while we compile. - compile_lock.unlock(); - - // Compile what we claimed; create_pipeline_func reacquires compile_mutex - // at the end to flip compile_pending/compiled and notify waiters. - if (has_claimed_task) { - auto & task = claimed_task; - ggml_vk_create_pipeline_func(device, task.pipeline, task.spv_size, task.spv_data, - task.entrypoint, task.parameter_count, task.wg_denoms, - task.specialization_constants, task.disable_robustness, - task.require_full_subgroups, task.required_subgroup_size); - } - - // Another thread may be compiling the pipeline we need; block on it here. - if (wait_pipeline) { - std::unique_lock wait_lock(device->compile_mutex); - device->compile_cv.wait(wait_lock, [&] { - return wait_pipeline->compiled.load(); - }); - } -} - -static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch); -static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev); - -static vk_device ggml_vk_get_device(size_t idx) { - VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")"); - - if (vk_instance.devices[idx] == nullptr) { - VK_LOG_DEBUG("Initializing new vk_device"); - vk_device device = std::make_shared(); - vk_instance.devices[idx] = device; - - device->memory_logger = std::unique_ptr(new vk_memory_logger()); - - size_t dev_num = vk_instance.device_indices[idx]; - - std::vector physical_devices = vk_instance.instance.enumeratePhysicalDevices(); - - if (dev_num >= physical_devices.size()) { - std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; - throw std::runtime_error("Device not found"); - } - - device->physical_device = physical_devices[dev_num]; - const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); - - device->architecture = get_device_architecture(device->physical_device); - - const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY"); - device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr; - - const char* GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM = getenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM"); - device->disable_host_visible_vidmem = GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM != nullptr; - - const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK"); - device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr; - - const char* GGML_VK_DISABLE_GRAPH_OPTIMIZE = getenv("GGML_VK_DISABLE_GRAPH_OPTIMIZE"); - device->disable_graph_optimize = GGML_VK_DISABLE_GRAPH_OPTIMIZE != nullptr; - - bool fp16_storage = false; - bool fp16_compute = false; - bool maintenance4_support = false; - bool sm_builtins = false; - bool amd_shader_core_properties2 = false; - bool pipeline_robustness = false; - bool coopmat2_support = false; - bool coopmat2_decode_vector_support = false; - bool pipeline_executable_properties_support = false; - device->coopmat_support = false; - device->integer_dot_product = false; - device->shader_64b_indexing = false; - bool bfloat16_support = false; - bool dot2_f16_support = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { - maintenance4_support = true; - } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { - fp16_storage = true; - } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { - fp16_compute = true; - } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { - sm_builtins = true; - } else if (strcmp("VK_AMD_shader_core_properties2", properties.extensionName) == 0) { - amd_shader_core_properties2 = true; - } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { - pipeline_robustness = true; - } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { - device->subgroup_size_control = true; -#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_COOPMAT")) { - device->coopmat_support = true; - device->coopmat_m = 0; - device->coopmat_n = 0; - device->coopmat_k = 0; -#endif -#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_COOPMAT2")) { - coopmat2_support = true; -#endif - } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) { - coopmat2_decode_vector_support = true; -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { - device->integer_dot_product = true; -#endif -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_BFLOAT16")) { - bfloat16_support = true; -#endif - } else if (strcmp("VK_VALVE_shader_mixed_float_dot_product", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_DOT2")) { - dot2_f16_support = true; - } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) { - pipeline_executable_properties_support = true; - } else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 && - getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) { - device->memory_priority = true; - } else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) { - device->external_memory_host = true; -#if defined(VK_EXT_shader_64bit_indexing) - } else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) { - device->shader_64b_indexing = true; -#endif - } - } - - vk::PhysicalDeviceProperties2 props2; - vk::PhysicalDeviceMaintenance3Properties props3; - vk::PhysicalDeviceMaintenance4Properties props4; - vk::PhysicalDeviceSubgroupProperties subgroup_props; - vk::PhysicalDeviceDriverProperties driver_props; - vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; - vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; - vk::PhysicalDeviceVulkan11Properties vk11_props; - vk::PhysicalDeviceVulkan12Properties vk12_props; - vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; - vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props; - vk::PhysicalDeviceExternalMemoryHostPropertiesEXT external_memory_host_props; - - props2.pNext = &props3; - props3.pNext = &subgroup_props; - subgroup_props.pNext = &driver_props; - driver_props.pNext = &vk11_props; - vk11_props.pNext = &vk12_props; - - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props; - - if (maintenance4_support) { - last_struct->pNext = (VkBaseOutStructure *)&props4; - last_struct = (VkBaseOutStructure *)&props4; - } - if (sm_builtins) { - last_struct->pNext = (VkBaseOutStructure *)&sm_props; - last_struct = (VkBaseOutStructure *)&sm_props; - } - if (amd_shader_core_properties2) { - last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props; - last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props; - } - if (device->subgroup_size_control) { - last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props; - last_struct = (VkBaseOutStructure *)&subgroup_size_control_props; - } - -#if defined(VK_NV_cooperative_matrix2) - vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; - if (coopmat2_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props; - last_struct = (VkBaseOutStructure *)&coopmat2_props; - } -#endif - - if (device->integer_dot_product) { - last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props; - last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props; - } - - if (device->external_memory_host) { - last_struct->pNext = (VkBaseOutStructure *)&external_memory_host_props; - last_struct = (VkBaseOutStructure *)&external_memory_host_props; - } - - device->physical_device.getProperties2(&props2); - device->properties = props2.properties; - device->vendor_id = device->properties.vendorID; - device->driver_id = driver_props.driverID; - - if (device->driver_id == vk::DriverId::eMoltenvk) { - // Disable external_memory_host until https://github.com/KhronosGroup/MoltenVK/pull/2622 - // is available in the Vulkan SDK. - device->external_memory_host = false; - } - - // Implementing the async backend interfaces seems broken on older Intel HW, - // see https://github.com/ggml-org/llama.cpp/issues/17302. - device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL || - std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) && - getenv("GGML_VK_DISABLE_ASYNC") == nullptr; - - if (!device->support_async) { - GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n"); - } - - const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); - - if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { - device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); - } else if (maintenance4_support) { - device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); - } else { - device->max_memory_allocation_size = props3.maxMemoryAllocationSize; - } - - const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE"); - - if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) { - device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE); - } else if (maintenance4_support) { - device->max_buffer_size = props4.maxBufferSize; - } else { - device->max_buffer_size = device->max_memory_allocation_size; - } - - const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE"); - - if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) { - device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE); - } else { - // Limit batching of allocations to 1GB by default to avoid fragmentation issues - device->suballocation_block_size = 1024*1024*1024; - } - device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size); - - device->subgroup_size = subgroup_props.subgroupSize; - device->subgroup_size_log2 = uint32_t(log2f(float(device->subgroup_size))); - device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; - if (sm_builtins) { - device->shader_core_count = sm_props.shaderSMCount; - } else if (amd_shader_core_properties2) { - device->shader_core_count = amd_shader_core_properties2_props.activeComputeUnitCount; - } else if (device->vendor_id == VK_VENDOR_ID_INTEL) { - device->shader_core_count = ggml_vk_intel_shader_core_count(device->physical_device); - } else { - device->shader_core_count = 0; - } - device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16; - - device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && - (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic); - device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && - (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic); -#ifdef __APPLE__ - // Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) - if (device->vendor_id == VK_VENDOR_ID_AMD) { - device->subgroup_arithmetic = false; - } -#endif - device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && - (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); -#ifdef __APPLE__ - if (device->vendor_id == VK_VENDOR_ID_AMD) { - device->subgroup_shuffle = false; - } -#endif - device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && - (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered); - - device->subgroup_ballot = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && - (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBallot); - - device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && - (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote); - - const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; - - device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - - if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) { - device->coopmat_support = false; - } - - device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated; - - device->min_imported_host_pointer_alignment = external_memory_host_props.minImportedHostPointerAlignment; - - device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations))); - - std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); - - // Try to find a non-graphics compute queue and transfer-focused queues - // Allow overriding avoiding the graphics queue because it can increase performance on RADV - const bool allow_graphics_queue = (getenv("GGML_VK_ALLOW_GRAPHICS_QUEUE") != nullptr); - const vk::QueueFlagBits graphics_flag = allow_graphics_queue ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics; - const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1); - const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1); - - const float priorities[] = { 1.0f, 1.0f }; - device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; - - std::vector device_queue_create_infos; - if (compute_queue_family_index != transfer_queue_family_index) { - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); - } else if(!device->single_queue) { - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); - } else { - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); - } - vk::DeviceCreateInfo device_create_info{}; - std::vector device_extensions; - vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures(); - - VkPhysicalDeviceFeatures2 device_features2; - device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; - device_features2.pNext = nullptr; - device_features2.features = (VkPhysicalDeviceFeatures)device_features; - - VkPhysicalDeviceVulkan11Features vk11_features; - vk11_features.pNext = nullptr; - vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; - device_features2.pNext = &vk11_features; - - VkPhysicalDeviceVulkan12Features vk12_features; - vk12_features.pNext = nullptr; - vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; - vk11_features.pNext = &vk12_features; - - last_struct = (VkBaseOutStructure *)&vk12_features; - - VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; - pl_robustness_features.pNext = nullptr; - pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; - pl_robustness_features.pipelineRobustness = VK_FALSE; - - if (pipeline_robustness) { - last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features; - last_struct = (VkBaseOutStructure *)&pl_robustness_features; - device_extensions.push_back("VK_EXT_pipeline_robustness"); - } - - VkPhysicalDeviceMemoryPriorityFeaturesEXT memory_priority_features; - memory_priority_features.pNext = nullptr; - memory_priority_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT; - memory_priority_features.memoryPriority = VK_FALSE; - if (device->memory_priority) { - last_struct->pNext = (VkBaseOutStructure *)&memory_priority_features; - last_struct = (VkBaseOutStructure *)&memory_priority_features; - device_extensions.push_back("VK_EXT_memory_priority"); - } - - VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features; - subgroup_size_control_features.pNext = nullptr; - subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT; - subgroup_size_control_features.computeFullSubgroups = false; - subgroup_size_control_features.subgroupSizeControl = false; - - if (device->subgroup_size_control) { - last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features; - last_struct = (VkBaseOutStructure *)&subgroup_size_control_features; - } - -#if defined(VK_KHR_cooperative_matrix) - VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; - coopmat_features.pNext = nullptr; - coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; - coopmat_features.cooperativeMatrix = VK_FALSE; - - if (device->coopmat_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; - last_struct = (VkBaseOutStructure *)&coopmat_features; - } -#endif - -#if defined(VK_NV_cooperative_matrix2) - VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; - coopmat2_features.pNext = nullptr; - coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; - if (coopmat2_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; - last_struct = (VkBaseOutStructure *)&coopmat2_features; - device_extensions.push_back("VK_NV_cooperative_matrix2"); - } -#endif - - VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; - coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; - if (coopmat2_decode_vector_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features; - last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features; - device_extensions.push_back(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME); - } - -#if defined(VK_KHR_shader_bfloat16) - VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; - bfloat16_features.pNext = nullptr; - bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; - if (bfloat16_support) { - last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; - last_struct = (VkBaseOutStructure *)&bfloat16_features; - device_extensions.push_back("VK_KHR_shader_bfloat16"); - } -#endif - - VkPhysicalDeviceMaintenance4Features maint4_features {}; - maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES; - if (maintenance4_support) { - last_struct->pNext = (VkBaseOutStructure *)&maint4_features; - last_struct = (VkBaseOutStructure *)&maint4_features; - device_extensions.push_back("VK_KHR_maintenance4"); - } - - VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {}; - shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; - if (device->integer_dot_product) { - last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features; - last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; - device_extensions.push_back("VK_KHR_shader_integer_dot_product"); - } - - VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE dot2_features {}; - dot2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE; - if (dot2_f16_support) { - last_struct->pNext = (VkBaseOutStructure *)&dot2_features; - last_struct = (VkBaseOutStructure *)&dot2_features; - device_extensions.push_back("VK_VALVE_shader_mixed_float_dot_product"); - } - - VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {}; - pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR; - if (pipeline_executable_properties_support) { - last_struct->pNext = (VkBaseOutStructure *)&pep_features; - last_struct = (VkBaseOutStructure *)&pep_features; - device_extensions.push_back("VK_KHR_pipeline_executable_properties"); - } - - if (device->external_memory_host) { - device_extensions.push_back("VK_EXT_external_memory_host"); - } - -#if defined(VK_EXT_shader_64bit_indexing) - VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {}; - shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT; - if (device->shader_64b_indexing) { - last_struct->pNext = (VkBaseOutStructure *)&shader_64bit_indexing_features; - last_struct = (VkBaseOutStructure *)&shader_64bit_indexing_features; - device_extensions.push_back("VK_EXT_shader_64bit_indexing"); - } -#endif - - vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); - - device->pipeline_executable_properties_support = pipeline_executable_properties_support; - - device->fp16 = device->fp16 && vk12_features.shaderFloat16; - -#if defined(VK_KHR_shader_bfloat16) - device->bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; -#else - device->bf16 = false; -#endif - - device->dot2_f16 = dot2_f16_support && dot2_features.shaderMixedFloatDotProductFloat16AccFloat32; - - device->pipeline_robustness = pl_robustness_features.pipelineRobustness; - - device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 && - device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_multi_add_push_constants) && - getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr; - - device->shader_int64 = device_features2.features.shaderInt64; - device->buffer_device_address = vk12_features.bufferDeviceAddress; - device->vulkan_memory_model = vk12_features.vulkanMemoryModel; - - if (device->subgroup_size_control) { - device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize; - device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize; - device_extensions.push_back("VK_EXT_subgroup_size_control"); - } - - device->subgroup_size_control = device->subgroup_size_control && - (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) && - subgroup_size_control_features.subgroupSizeControl; - - device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups; - -#if defined(VK_KHR_cooperative_matrix) - device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; - device->coopmat1_fa_support = device->coopmat_support && device->subgroup_require_full_support; -#endif - - if (coopmat2_support) { -#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - if (coopmat2_features.cooperativeMatrixWorkgroupScope && - coopmat2_features.cooperativeMatrixFlexibleDimensions && - coopmat2_features.cooperativeMatrixReductions && - coopmat2_features.cooperativeMatrixConversions && - coopmat2_features.cooperativeMatrixPerElementOperations && - coopmat2_features.cooperativeMatrixTensorAddressing && - coopmat2_features.cooperativeMatrixBlockLoads && - vk12_features.bufferDeviceAddress) { - - std::vector flexible_dimensions; - uint32_t count = 0; - - PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV - _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = - (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV) - vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); - - _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr); - - VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {}; - empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; - flexible_dimensions.resize(count, empty_prop); - - _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data()); - - bool found_fp16_128 = false, - found_fp16_256 = false, - found_fp32_128 = false, - found_fp32_256 = false; - bool found_bf16_128 = false, - found_bf16_256 = false; - // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 - // with 32x16x16 and 256 with 32x32x16. - for (auto &prop : flexible_dimensions) { - if (prop.saturatingAccumulation == VK_FALSE && - prop.scope == VK_SCOPE_WORKGROUP_KHR) { - - if (prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && - prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { - - if (prop.workgroupInvocations == 128 && - prop.MGranularity <= 32 && - prop.NGranularity <= 16 && - prop.KGranularity <= 16) { - if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { - found_fp16_128 = true; - } - if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { - found_fp32_128 = true; - } - } - if (prop.workgroupInvocations == 256 && - prop.MGranularity <= 32 && - prop.NGranularity <= 32 && - prop.KGranularity <= 16) { - if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { - found_fp16_256 = true; - } - if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { - found_fp32_256 = true; - } - } - } - -#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && - prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && - prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { - - if (prop.workgroupInvocations == 128 && - prop.MGranularity <= 32 && - prop.NGranularity <= 16 && - prop.KGranularity <= 16) { - found_bf16_128 = true; - } - if (prop.workgroupInvocations == 256 && - prop.MGranularity <= 32 && - prop.NGranularity <= 32 && - prop.KGranularity <= 16) { - found_bf16_256 = true; - } - } -#endif - } - } - if (found_fp16_128 && found_fp16_256 && - found_fp32_128 && found_fp32_256 && - coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { - device->coopmat2 = true; - device->coopmat2_bf16_support = found_bf16_128 && found_bf16_256; - device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; - } - } -#endif - } - - if (!vk11_features.storageBuffer16BitAccess) { - std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; - throw std::runtime_error("Unsupported device"); - } - - device_extensions.push_back("VK_KHR_16bit_storage"); - -#ifdef GGML_VULKAN_VALIDATE - device_extensions.push_back("VK_KHR_shader_non_semantic_info"); -#endif - - if (device->fp16) { - device_extensions.push_back("VK_KHR_shader_float16_int8"); - } - -#if defined(VK_KHR_cooperative_matrix) - if (device->coopmat_support) { - // Query supported shapes - std::vector cm_props; - - PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = - (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vk_instance.instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); - - uint32_t cm_props_num; - - pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, nullptr); - - cm_props.resize(cm_props_num); - - for (auto& prop : cm_props) { - prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; - } - - pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, cm_props.data()); - - VK_LOG_DEBUG("ggml_vulkan: Cooperative Matrix Shapes: " << cm_props.size()); - - for (auto& prop : cm_props) { - VK_LOG_DEBUG("ggml_vulkan: M: " << prop.MSize << " N: " << prop.NSize << " K: " << prop.KSize << " A: " << vk::to_string((vk::ComponentTypeKHR)prop.AType) << " B: " << vk::to_string((vk::ComponentTypeKHR)prop.BType) << " C: " << vk::to_string((vk::ComponentTypeKHR)prop.CType) << " Result: " << vk::to_string((vk::ComponentTypeKHR)prop.ResultType) << " saturatingAccumulation: " << prop.saturatingAccumulation << " scope: " << vk::to_string((vk::ScopeKHR)prop.scope)); - - if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eFloat16 && - (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eFloat16 && - (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup - ) { - if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat32 && - (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat32) { - // coopmat sizes not set yet - if (device->coopmat_m == 0) { - device->coopmat_acc_f32_support = true; - device->coopmat_m = prop.MSize; - device->coopmat_n = prop.NSize; - device->coopmat_k = prop.KSize; - } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { - // Only enable if shape is identical - device->coopmat_acc_f32_support = true; - } - if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) { - device->coopmat_support_16x16x16_f32acc = true; - } - } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 && - (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) { - // coopmat sizes not set yet - if (device->coopmat_m == 0) { - device->coopmat_acc_f16_support = true; - device->coopmat_m = prop.MSize; - device->coopmat_n = prop.NSize; - device->coopmat_k = prop.KSize; - } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { - // Only enable if shape is identical - device->coopmat_acc_f16_support = true; - } - if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) { - device->coopmat_support_16x16x16_f16acc = true; - } - } - } else if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eSint8 && - (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eSint8 && - (vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eSint32 && - (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eSint32 && - (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup && - device->coopmat_int_m == 0 - ) { - device->coopmat_int_support = true; - device->coopmat_int_m = prop.MSize; - device->coopmat_int_n = prop.NSize; - device->coopmat_int_k = prop.KSize; - } -#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && - prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && - prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && - (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup - ) { - // coopmat sizes not set yet - if (device->coopmat_m == 0) { - device->coopmat_bf16_support = true; - device->coopmat_m = prop.MSize; - device->coopmat_n = prop.NSize; - device->coopmat_k = prop.KSize; - } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { - // Only enable if shape is identical - device->coopmat_bf16_support = true; - } - } -#endif - } - - if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) { - // No suitable matmul mode found - GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n"); - device->coopmat_support = false; - } - if (getenv("GGML_VK_DISABLE_BFLOAT16")) { - device->coopmat_bf16_support = false; - } - } - - if (device->coopmat_support) { - device_extensions.push_back("VK_KHR_cooperative_matrix"); - } -#if defined(VK_KHR_shader_bfloat16) - if (device->coopmat_bf16_support) { - device_extensions.push_back("VK_KHR_shader_bfloat16"); - } -#endif -#endif - device->name = GGML_VK_NAME + std::to_string(idx); - - device_create_info - .setFlags(vk::DeviceCreateFlags()) - .setQueueCreateInfos(device_queue_create_infos) - .setPEnabledExtensionNames(device_extensions); - device_create_info.setPNext(&device_features2); - device->device = device->physical_device.createDevice(device_create_info); - - // Queues - ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false); - - // Shaders - // Disable matmul tile sizes early if performance low or not supported - for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { - switch (device->vendor_id) { -#ifndef GGML_VULKAN_RUN_TESTS - case VK_VENDOR_ID_AMD: - device->mul_mat_l[i] = device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary; - device->mul_mat_m[i] = true; - device->mul_mat_s[i] = true; - device->mul_mat_id_l[i] = false; - device->mul_mat_id_m[i] = true; - device->mul_mat_id_s[i] = true; - break; - case VK_VENDOR_ID_INTEL: { - // Current Windows driver does not expose BF16 support. - // We only want to use l_warptile if coopmat is available and is Xe2+ - const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2; - const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat; - device->mul_mat_l[i] = use_l_warptile; - device->mul_mat_id_l[i] = use_l_warptile; - device->mul_mat_m[i] = true; - device->mul_mat_s[i] = true; - device->mul_mat_id_m[i] = true; - device->mul_mat_id_s[i] = true; - break; - } - case VK_VENDOR_ID_APPLE: - device->mul_mat_l[i] = false; - device->mul_mat_m[i] = true; - device->mul_mat_s[i] = false; - device->mul_mat_id_l[i] = false; - device->mul_mat_id_m[i] = true; - device->mul_mat_id_s[i] = false; - break; -#endif - default: - device->mul_mat_l[i] = true; - device->mul_mat_m[i] = true; - device->mul_mat_s[i] = true; - device->mul_mat_id_l[i] = true; - device->mul_mat_id_m[i] = true; - device->mul_mat_id_s[i] = true; - break; - } - -#if VK_HEADER_VERSION >= 287 - // Honeykrisp driver for Asahi Linux doesn't report VK_VENDOR_ID_APPLE. - // Check for Honeykrisp driver and force same configuration as the VK_VENDOR_ID_APPLE case. - if (device->driver_id == vk::DriverId::eMesaHoneykrisp) { - device->mul_mat_l[i] = false; - device->mul_mat_m[i] = true; - device->mul_mat_s[i] = false; - device->mul_mat_id_l[i] = false; - device->mul_mat_id_m[i] = true; - device->mul_mat_id_s[i] = false; - } -#endif - - device->mul_mat_l_int[i] = device->mul_mat_l[i]; - device->mul_mat_m_int[i] = device->mul_mat_m[i]; - device->mul_mat_s_int[i] = device->mul_mat_s[i]; - device->mul_mat_id_l_int[i] = device->mul_mat_id_l[i]; - device->mul_mat_id_m_int[i] = device->mul_mat_id_m[i]; - device->mul_mat_id_s_int[i] = device->mul_mat_id_s[i]; - } - - - std::vector dsl_binding; - std::vector dsl_binding_flags; - for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) { - dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); - dsl_binding_flags.push_back({}); - } - - vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; - - vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( - {}, - dsl_binding); - descriptor_set_layout_create_info.setPNext(&dslbfci); - device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); - - ggml_vk_load_shaders(device); - - // Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled. - const bool prefers_transfer_queue = - device->vendor_id == VK_VENDOR_ID_AMD && - device->architecture != AMD_GCN && - !device->uma && - !allow_graphics_queue; - - if (!device->single_queue) { - const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; - ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); - - device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); - } else { - // TODO: Use pointer or reference to avoid copy - device->transfer_queue.copyFrom(device->compute_queue); - device->transfer_queue.cmd_pool.init(device, &device->transfer_queue); - - device->async_use_transfer_queue = false; - } - - device->buffer_type = { - /* .iface = */ ggml_backend_vk_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx), - /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device }, - }; - - device->fence = device->device.createFence({}); - - device->idx = idx; - - device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr; - - device->add_rms_fusion = !device->disable_fusion && - device->subgroup_arithmetic && - device->vendor_id != VK_VENDOR_ID_INTEL; - device->partials_binding_alignment = - std::max(4u, (uint32_t)device->properties.limits.minStorageBufferOffsetAlignment); - - device->mmvq_mode = 0; - if (getenv("GGML_VK_DISABLE_MMVQ")) { - device->mmvq_mode = -1; - } else if (getenv("GGML_VK_FORCE_MMVQ")) { - device->mmvq_mode = 1; - } - - return device; - } - - return vk_instance.devices[idx]; -} - -static void ggml_vk_print_gpu_info(size_t idx) { - GGML_ASSERT(idx < vk_instance.device_indices.size()); - size_t dev_num = vk_instance.device_indices[idx]; - VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")"); - GGML_ASSERT(vk_instance_initialized); - - std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); - - if (dev_num >= devices.size()) { - std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; - throw std::runtime_error("Device not found"); - } - - vk::PhysicalDevice physical_device = devices[dev_num]; - std::vector ext_props = physical_device.enumerateDeviceExtensionProperties(); - - bool fp16_storage = false; - bool fp16_compute = false; - bool coopmat_support = false; - bool coopmat2_support = false; - bool coopmat2_decode_vector_support = false; - bool integer_dot_product = false; - bool bfloat16_support = false; - bool dot2_f16_support = false; - - for (auto properties : ext_props) { - if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { - fp16_storage = true; - } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { - fp16_compute = true; -#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_COOPMAT")) { - coopmat_support = true; -#endif -#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_COOPMAT2")) { - coopmat2_support = true; -#endif - } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) { - coopmat2_decode_vector_support = true; -#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { - integer_dot_product = true; -#endif -#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) - } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_BFLOAT16")) { - bfloat16_support = true; -#endif - } else if (strcmp("VK_VALVE_shader_mixed_float_dot_product", properties.extensionName) == 0 && - !getenv("GGML_VK_DISABLE_DOT2")) { - dot2_f16_support = true; - } - } - - const vk_device_architecture device_architecture = get_device_architecture(physical_device); - - const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); - bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; - - bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - - vk::PhysicalDeviceProperties2 props2; - vk::PhysicalDeviceMaintenance3Properties props3; - vk::PhysicalDeviceSubgroupProperties subgroup_props; - vk::PhysicalDeviceDriverProperties driver_props; - vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props; - props2.pNext = &props3; - props3.pNext = &subgroup_props; - subgroup_props.pNext = &driver_props; - - // Pointer to the last chain element - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; - - if (integer_dot_product) { - last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props; - last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props; - } - - physical_device.getProperties2(&props2); - - VkPhysicalDeviceFeatures2 device_features2; - device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; - device_features2.pNext = nullptr; - - VkPhysicalDeviceVulkan11Features vk11_features; - vk11_features.pNext = nullptr; - vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; - device_features2.pNext = &vk11_features; - - VkPhysicalDeviceVulkan12Features vk12_features; - vk12_features.pNext = nullptr; - vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; - vk11_features.pNext = &vk12_features; - - // Pointer to the last chain element - last_struct = (VkBaseOutStructure *)&vk12_features; - -#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; - coopmat_features.pNext = nullptr; - coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; - coopmat_features.cooperativeMatrix = VK_FALSE; - - if (coopmat_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; - last_struct = (VkBaseOutStructure *)&coopmat_features; - } -#endif - - VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {}; - shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; - if (integer_dot_product) { - last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features; - last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; - } - -#if defined(VK_KHR_shader_bfloat16) - VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; - bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; - if (bfloat16_support) { - last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; - last_struct = (VkBaseOutStructure *)&bfloat16_features; - } -#endif - -#if defined(VK_NV_cooperative_matrix2) - VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; - coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; - if (coopmat2_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; - last_struct = (VkBaseOutStructure *)&coopmat2_features; - } -#endif - - VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; - coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; - if (coopmat2_decode_vector_support) { - last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features; - last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features; - } - - VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE dot2_features {}; - dot2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE; - if (dot2_f16_support) { - last_struct->pNext = (VkBaseOutStructure *)&dot2_features; - last_struct = (VkBaseOutStructure *)&dot2_features; - } - - vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); - - fp16 = fp16 && vk12_features.shaderFloat16; - -#if defined(VK_KHR_shader_bfloat16) - bool bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; -#else - bool bf16 = false; -#endif - - uint32_t default_subgroup_size = get_subgroup_size("", device_architecture); - const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; - const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; - - integer_dot_product = integer_dot_product - && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated - && shader_integer_dot_product_features.shaderIntegerDotProduct; - - coopmat_support = coopmat_support -#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - && coopmat_features.cooperativeMatrix -#endif - && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture); - -#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - coopmat2_support = coopmat2_support && - coopmat2_features.cooperativeMatrixWorkgroupScope && - coopmat2_features.cooperativeMatrixFlexibleDimensions && - coopmat2_features.cooperativeMatrixReductions && - coopmat2_features.cooperativeMatrixConversions && - coopmat2_features.cooperativeMatrixPerElementOperations && - coopmat2_features.cooperativeMatrixTensorAddressing && - coopmat2_features.cooperativeMatrixBlockLoads; -#else - coopmat2_support = false; -#endif - - coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; -#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) - coopmat2_decode_vector_support = false; -#endif - - std::string matrix_cores = coopmat2_support ? (coopmat2_decode_vector_support ? "NV_coopmat2v" : "NV_coopmat2") - : coopmat_support ? "KHR_coopmat" - : "none"; - - bool dot2_f16 = dot2_f16_support && dot2_features.shaderMixedFloatDotProductFloat16AccFloat32; - const char *fp16_str = fp16 ? (dot2_f16 ? "dot2" : "1") : "0"; - - std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %s | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16_str, bf16, subgroup_size, - props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str()); - - if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { - GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); - } -} - -static bool ggml_vk_instance_layer_settings_available(); -static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); -static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); -static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev); - -static DispatchLoaderDynamic ggml_vk_default_dispatcher_instance; -DispatchLoaderDynamic & ggml_vk_default_dispatcher() { - return ggml_vk_default_dispatcher_instance; -} - -static void ggml_vk_instance_init() { - if (vk_instance_initialized) { - return; - } - VK_LOG_DEBUG("ggml_vk_instance_init()"); - - // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- - ggml_vk_default_dispatcher_instance.init(vkGetInstanceProcAddr); - - uint32_t api_version = vk::enumerateInstanceVersion(); - - if (api_version < VK_API_VERSION_1_2) { - std::cerr << "ggml_vulkan: Error: Vulkan 1.2 required." << std::endl; - throw vk::SystemError(vk::Result::eErrorFeatureNotPresent, "Vulkan 1.2 required"); - } - - vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version }; - - const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); - const bool layer_settings = ggml_vk_instance_layer_settings_available(); -#ifdef __APPLE__ - const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); -#endif - const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr; - std::vector layers; - - if (layer_settings) { - layers.push_back("VK_LAYER_KHRONOS_validation"); - } - std::vector extensions; - if (layer_settings) { - extensions.push_back("VK_EXT_layer_settings"); - } -#ifdef __APPLE__ - if (portability_enumeration_ext) { - extensions.push_back("VK_KHR_portability_enumeration"); - } -#endif - if (debug_utils_ext) { - extensions.push_back("VK_EXT_debug_utils"); - } - VkBool32 enable_best_practice = layer_settings; - std::vector settings = { - { - "VK_LAYER_KHRONOS_validation", - "validate_best_practices", - vk::LayerSettingTypeEXT::eBool32, - 1, - &enable_best_practice - }, - }; - vk::LayerSettingsCreateInfoEXT layer_setting_info(settings); - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions, &layer_setting_info); -#ifdef __APPLE__ - if (portability_enumeration_ext) { - instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; - } -#endif - - vk_instance.instance = vk::createInstance(instance_create_info); - vk_instance_initialized = true; - - if (debug_utils_ext) { - vk_instance.debug_utils_support = true; - vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT"); - vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT"); - vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT"); - vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT"); - vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT"); - vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT"); - } - - vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; - vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr; - vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr; - vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr; - const char* GGML_VK_PIPELINE_STATS = getenv("GGML_VK_PIPELINE_STATS"); - if (GGML_VK_PIPELINE_STATS != nullptr) { - vk_pipeline_stats_filter = GGML_VK_PIPELINE_STATS; - } - const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY"); - - if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) { - vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY); - } - - // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- - VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance); - - std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); - - // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan - char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); - if (devices_env != nullptr) { - size_t num_available_devices = devices.size(); - - std::string devices(devices_env); - std::replace(devices.begin(), devices.end(), ',', ' '); - - std::stringstream ss(devices); - size_t tmp; - while (ss >> tmp) { - if(tmp >= num_available_devices) { - std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl; - throw std::runtime_error("Invalid Vulkan device index"); - } - vk_instance.device_indices.push_back(tmp); - } - } else { - // If no vulkan devices are found, return early - if (devices.empty()) { - GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); - return; - } - - // Default to using all dedicated GPUs - for (size_t i = 0; i < devices.size(); i++) { - vk::PhysicalDeviceProperties2 new_props; - vk::PhysicalDeviceDriverProperties new_driver; - vk::PhysicalDeviceIDProperties new_id; - new_props.pNext = &new_driver; - new_driver.pNext = &new_id; - devices[i].getProperties2(&new_props); - - if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) { - // Check if there are two physical devices corresponding to the same GPU - // This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux), - // see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication. - // MoltenVK on macOS may report the same UUID for distinct GPUs on multi-GPU cards, - // see https://github.com/KhronosGroup/MoltenVK/issues/2683. Skip when both old/new - // driver is MoltenVK - auto old_device = std::find_if( - vk_instance.device_indices.begin(), - vk_instance.device_indices.end(), - [&devices, &new_id, &new_driver](const size_t k){ - vk::PhysicalDeviceProperties2 old_props; - vk::PhysicalDeviceDriverProperties old_driver; - vk::PhysicalDeviceIDProperties old_id; - old_props.pNext = &old_driver; - old_driver.pNext = &old_id; - devices[k].getProperties2(&old_props); - - bool same_uuid = std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID)); - same_uuid = same_uuid || ( - old_id.deviceLUIDValid && new_id.deviceLUIDValid && - std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID)) - ); - bool both_molten_vk = (new_driver.driverID == vk::DriverId::eMoltenvk && old_driver.driverID == vk::DriverId::eMoltenvk); - - return same_uuid && !both_molten_vk; - } - ); - if (old_device == vk_instance.device_indices.end()) { - vk_instance.device_indices.push_back(i); - } else { - // There can be two physical devices corresponding to the same GPU if there are 2 different drivers - // This can cause error when splitting layers aross the devices, need to keep only 1 - VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID"); - - vk::PhysicalDeviceProperties2 old_props; - vk::PhysicalDeviceDriverProperties old_driver; - old_props.pNext = &old_driver; - devices[*old_device].getProperties2(&old_props); - - std::map driver_priorities {}; - int old_priority = std::numeric_limits::max(); - int new_priority = std::numeric_limits::max(); - - // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id - // Smaller number -> higher priority - switch (old_props.properties.vendorID) { - case VK_VENDOR_ID_AMD: - driver_priorities[vk::DriverId::eMesaRadv] = 1; - driver_priorities[vk::DriverId::eAmdOpenSource] = 2; - driver_priorities[vk::DriverId::eAmdProprietary] = 3; - break; - case VK_VENDOR_ID_INTEL: - driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1; - driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2; - break; - case VK_VENDOR_ID_NVIDIA: - driver_priorities[vk::DriverId::eNvidiaProprietary] = 1; -#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235 - driver_priorities[vk::DriverId::eMesaNvk] = 2; -#endif - break; - case VK_VENDOR_ID_QUALCOMM: - driver_priorities[vk::DriverId::eQualcommProprietary] = 1; - driver_priorities[vk::DriverId::eMesaTurnip] = 2; - break; - } - driver_priorities[vk::DriverId::eMesaDozen] = 100; - - if (driver_priorities.count(old_driver.driverID)) { - old_priority = driver_priorities[old_driver.driverID]; - } - if (driver_priorities.count(new_driver.driverID)) { - new_priority = driver_priorities[new_driver.driverID]; - } - - if (new_priority < old_priority) { - auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device); - vk_instance.device_indices.erase(r, vk_instance.device_indices.end()); - vk_instance.device_indices.push_back(i); - - VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName); - } - else { - VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl); - } - } - } - } - - // If no GPUs found, fall back to the first non-CPU device. - // If only CPU devices are available, return without devices. - if (vk_instance.device_indices.empty()) { - for (size_t i = 0; i < devices.size(); i++) { - if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) { - vk_instance.device_indices.push_back(i); - break; - } - } - } - - if (vk_instance.device_indices.empty()) { - GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); - return; - } - } - GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size()); - - for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { - vk::PhysicalDevice vkdev = devices[vk_instance.device_indices[i]]; - std::vector extensionprops = vkdev.enumerateDeviceExtensionProperties(); - - bool membudget_supported = false; - for (const auto & ext : extensionprops) { - if (strcmp(VK_EXT_MEMORY_BUDGET_EXTENSION_NAME, ext.extensionName) == 0) { - membudget_supported = true; - break; - } - } - - vk_instance.device_supports_membudget.push_back(membudget_supported); - - ggml_vk_print_gpu_info(i); - } -} - -static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { - VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")"); - ggml_vk_instance_init(); - GGML_ASSERT(idx < vk_instance.device_indices.size()); - - ctx->name = GGML_VK_NAME + std::to_string(idx); - - ctx->device = ggml_vk_get_device(idx); - - ctx->semaphore_idx = 0; - ctx->event_idx = 0; - - ctx->prealloc_size_x = 0; - ctx->prealloc_size_y = 0; - ctx->prealloc_size_split_k = 0; - // Fixed size of 1KB, for deterministic behavior - ctx->prealloc_size_add_rms_partials = 1024; - - ctx->fence = ctx->device->device.createFence({}); - ctx->almost_ready_fence = ctx->device->device.createFence({}); - - ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); - if (ctx->device->async_use_transfer_queue) { - vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; - vk::SemaphoreCreateInfo ci{}; - ci.setPNext(&tci); - ctx->transfer_semaphore.s = ctx->device->device.createSemaphore(ci); - ctx->transfer_semaphore.value = 0; - - ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); - } - - if (vk_perf_logger_enabled) { - ctx->perf_logger = std::unique_ptr(new vk_perf_logger()); - } - -#ifdef GGML_VULKAN_CHECK_RESULTS - const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); - vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); - const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR"); - vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor)); -#endif -} - -static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { - VK_LOG_DEBUG("ggml_vk_get_to_fp16()"); - switch (type) { - case GGML_TYPE_F32: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - break; - default: - return nullptr; - } - - return ctx->device->pipeline_dequant[type]; -} - -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { - VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ", " << prec << ")"); - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_f32; - } - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_f32_f16; - } - if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { - return ctx->device->pipeline_matmul_bf16; - } - if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_f16_f32.f16acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_f16.f16acc; - } - } else { - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_f16_f32.f32acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_f16.f32acc; - } - } - - // MMQ - if (src1_type == GGML_TYPE_Q8_1) { - vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc; - - if (pipelines->is_empty()) { - return nullptr; - } - - return pipelines; - } - - if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) { - return nullptr; - } - - switch (src0_type) { - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - break; - default: - return nullptr; - } - - if (ctx->device->coopmat2) { - assert(src1_type == GGML_TYPE_F16); - return prec == GGML_PREC_DEFAULT ? ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f32acc; - } - if (ctx->device->coopmat_support) { - return (ctx->device->fp16 && ctx->device->coopmat_acc_f16_support && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; - } - return (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; -} - -static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols, uint32_t m, uint32_t k) { - VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()"); - GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16 || b_type == GGML_TYPE_Q8_1); - GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols); - - if (b_type == GGML_TYPE_Q8_1) { - switch (a_type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_MXFP4: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - break; - default: - return nullptr; - } - } - - switch (a_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - break; - default: - return nullptr; - } - - // heuristic to choose workgroup size - uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; - if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { - // Prefer larger workgroups when M is small, to spread the work out more - // and keep more SMs busy. - // q6_k seems to prefer small workgroup size even for "medium" values of M. - if (a_type == GGML_TYPE_Q6_K) { - if (m < 4096 && k >= 1024) { - dmmv_wg = DMMV_WG_SIZE_LARGE; - } - } else { - if (m <= 8192 && k >= 1024) { - dmmv_wg = DMMV_WG_SIZE_LARGE; - } - } - } - - if (b_type == GGML_TYPE_Q8_1) { - if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { - dmmv_wg = DMMV_WG_SIZE_SUBGROUP; - } - return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1]; - } - - return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1]; -} - -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { - VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()"); - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_id_f32; - } - if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { - return ctx->device->pipeline_matmul_id_bf16; - } - if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_id_f16_f32.f16acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_id_f16.f16acc; - } - } else { - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_id_f16_f32.f32acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_id_f16.f32acc; - } - } - - // MMQ - if (src1_type == GGML_TYPE_Q8_1) { - vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc; - - if (pipelines->is_empty()) { - return nullptr; - } - - return pipelines; - } - - GGML_ASSERT(src1_type == GGML_TYPE_F32 || (ctx->device->coopmat2 && src1_type == GGML_TYPE_F16)); - - switch (src0_type) { - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - break; - default: - return nullptr; - } - - vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; - // XXX TODO 'prec' is not actually allowed in mul_mat_id. - bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/; - bool support_fp16acc = !mmp.f16acc->is_empty(); - bool support_fp32acc = !mmp.f32acc->is_empty(); - - if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) { - return mmp.f16acc; - } else { - GGML_ASSERT(support_fp32acc); - return mmp.f32acc; - } -} - -static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t m, uint32_t k) { - VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec_id()"); - GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_Q8_1); - - if (b_type == GGML_TYPE_Q8_1) { - switch (a_type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_MXFP4: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - break; - default: - return nullptr; - } - } - - switch (a_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - break; - default: - return nullptr; - } - - // heuristic to choose workgroup size - uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; - if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { - // Prefer larger workgroups when M is small, to spread the work out more - // and keep more SMs busy. - // q6_k seems to prefer small workgroup size even for "medium" values of M. - if (a_type == GGML_TYPE_Q6_K) { - if (m < 4096 && k >= 1024) { - dmmv_wg = DMMV_WG_SIZE_LARGE; - } - } else { - if (m <= 8192 && k >= 1024) { - dmmv_wg = DMMV_WG_SIZE_LARGE; - } - } - } - - if (b_type == GGML_TYPE_Q8_1) { - if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { - dmmv_wg = DMMV_WG_SIZE_SUBGROUP; - } - return ctx->device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[dmmv_wg][a_type]; - } - - return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[dmmv_wg][a_type]; -} - -static void * ggml_vk_host_malloc(vk_device& device, size_t size) { - VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")"); - vk_buffer buf = ggml_vk_create_buffer(device, size, - {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); - - if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { - fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", - size/1024.0/1024.0); - device->device.freeMemory(buf->device_memory); - device->device.destroyBuffer(buf->buffer); - return nullptr; - } - - std::lock_guard guard(device->pinned_memory_mutex); - device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); - - return buf->ptr; -} - -static void ggml_vk_host_free(vk_device& device, void* ptr) { - if (ptr == nullptr) { - return; - } - VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")"); - std::lock_guard guard(device->pinned_memory_mutex); - - vk_buffer buf; - size_t index; - for (size_t i = 0; i < device->pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]); - if (ptr >= addr && ptr < endr) { - buf = std::get<2>(device->pinned_memory[i]); - index = i; - break; - } - } - if (buf == nullptr) { - fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n"); - return; - } - - ggml_vk_destroy_buffer(buf); - - device->pinned_memory.erase(device->pinned_memory.begin() + index); -} - -static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { - std::shared_lock guard(device->pinned_memory_mutex); - buf = nullptr; - buf_offset = 0; - for (size_t i = 0; i < device->pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]); - if (ptr >= addr && ptr < endr) { - buf = std::get<2>(device->pinned_memory[i]); - buf_offset = ((const uint8_t *)ptr) - addr; - break; - } - } -} - -static vk_subbuffer ggml_vk_tensor_subbuffer( - const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false) { - - vk_buffer buffer = nullptr; - size_t offset = 0; - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, tensor->data, buffer, offset); - } - if (!buffer) { - auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - buffer = buf_ctx->dev_buffer; - offset = vk_tensor_offset(tensor) + tensor->view_offs; - } - GGML_ASSERT(buffer != nullptr); - - size_t size = ggml_nbytes(tensor); - - size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); - // The shader must support misaligned offsets when indexing into the buffer - GGML_ASSERT(allow_misalign || misalign_bytes == 0); - offset &= ~misalign_bytes; - size += misalign_bytes; - - return vk_subbuffer{buffer, offset, size}; -} - -// Get a command buffer from pool. Create a new one if no reusable buffer is available -static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) { - for (auto& cmd_buffer : pool.cmd_buffers) { - if (!cmd_buffer.in_use) { - cmd_buffer.use_counter++; - cmd_buffer.in_use = true; - return &cmd_buffer; - } - } - return ggml_vk_create_cmd_buffer(device, pool); -} - -static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) { - vk_submission s; - s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p); - if (one_time) { - s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); - } else { - s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} }); - } - - return s; -} - -template size_t push_constant_size(const T &t) { - static_assert(std::is_class::value, "T must be a struct/class"); - GGML_UNUSED(t); - return sizeof(T); -} -template size_t push_constant_size(const std::vector &t) { - GGML_UNUSED(t); - return sizeof(T) * t.size(); -} -template size_t push_constant_size(const std::array &t) { - GGML_UNUSED(t); - return sizeof(T) * N; -} - -template const T *push_constant_data(const T &t) { - static_assert(std::is_class::value, "T must be a struct/class"); - return &t; -} -template const T *push_constant_data(const std::vector &t) { - return t.data(); -} -template const T *push_constant_data(const std::array &t) { - return t.data(); -} - -template -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, const T &push_constants, std::array elements) { - const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); - const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); - const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); - VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; - for (auto& buffer : descriptor_buffer_infos) { - std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), "; - } - std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); - GGML_ASSERT(wg0 <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] && - wg1 <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && - wg2 <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size()); - GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT); - GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size()); - GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants)); - - vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++]; - vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; - ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); - - subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants)); - subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); - subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, - pipeline->layout, - 0, - { descriptor_set }, - {}); - subctx->s->buffer->buf.dispatch(wg0, wg1, wg2); -} - -static void ggml_vk_ctx_end(vk_context& ctx) { - VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); - if (ctx->s == nullptr) { - return; - } - - ctx->s->buffer->buf.end(); - ctx->s = nullptr; -} - -static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { - VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")"); - if (subctx->s != nullptr) { - ggml_vk_ctx_end(subctx); - } - - subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) }); - subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); -} - -static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) { - vk_context result; - if (!ctx->compute_ctx.expired()) { - result = ctx->compute_ctx.lock(); - } else { - result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - - ctx->compute_ctx = result; - ggml_vk_ctx_begin(ctx->device, result); - } - - if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { - result->s->wait_semaphores.push_back(ctx->transfer_semaphore); - ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; - } - - return result; -} - -// Submit any pending transfer queue work and signal the transfer semaphore. -// The next compute context created via ggml_vk_get_compute_ctx will wait on this semaphore. -// Returns true if work was submitted. -static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) { - if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) { - return false; - } - - vk_context cpy_ctx = ctx->transfer_ctx.lock(); - ggml_vk_ctx_end(cpy_ctx); - - for (auto& cpy : cpy_ctx->in_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - - ctx->transfer_semaphore.value++; - cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore); - - ggml_vk_submit(cpy_ctx, {}); - ctx->transfer_ctx.reset(); - return true; -} - -static size_t ggml_vk_align_size(size_t width, size_t align) { - VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")"); - return CEIL_DIV(width, align) * align; -} - -static void deferred_memcpy(void * dst, const void * src, size_t size, std::vector* memcpys = nullptr) { - if (memcpys == nullptr) { - memcpy(dst, src, size); - } else { - memcpys->emplace_back(dst, src, size); - } -} - -static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector* memsets = nullptr) { - if (memsets == nullptr) { - memset(dst, val, size); - } else { - memsets->emplace_back(dst, val, size); - } -} - -static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) { - if (device->sync_staging == nullptr || device->sync_staging->size < size) { - VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); - ggml_vk_destroy_buffer(device->sync_staging); - device->sync_staging = ggml_vk_create_buffer_check(device, size, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); - } -} - -static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { - if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { - VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); - ggml_vk_destroy_buffer(ctx->sync_staging); - ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); - } -} - -static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { - VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); - GGML_ASSERT(!ggml_is_contiguous(tensor)); - // Buffer is already mapped - if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ABORT("fatal error"); - } - // Check if src is pinned memory - vk_buffer buf = nullptr; - size_t buf_offset = 0; - ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset); - - const uint64_t ne0 = tensor->ne[0]; - const uint64_t ne1 = tensor->ne[1]; - const uint64_t ne2 = tensor->ne[2]; - const uint64_t ne3 = tensor->ne[3]; - const uint64_t nb0 = tensor->nb[0]; - const uint64_t nb1 = tensor->nb[1]; - const uint64_t nb2 = tensor->nb[2]; - const uint64_t nb3 = tensor->nb[3]; - const ggml_type type = tensor->type; - const uint64_t ts = ggml_type_size(type); - const uint64_t bs = ggml_blck_size(type); - - const uint64_t dstnb0 = ts; - const uint64_t dstnb1 = dstnb0*(ne0/bs); - const uint64_t dstnb2 = dstnb1*ne1; - const uint64_t dstnb3 = dstnb2*ne2; - - const uint64_t ne = ggml_nelements(tensor); - - if (buf != nullptr) { - // Memory is pinned, use as staging buffer - std::vector slices; - - for (uint64_t i3 = 0; i3 < ne3; i3++) { - for (uint64_t i2 = 0; i2 < ne2; i2++) { - // Find longest contiguous slice - if (ne1*nb1 == dstnb2) { - slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 }); - } else { - for (uint64_t i1 = 0; i1 < ne1; i1++) { - if (ne0*nb0/bs == dstnb1) { - slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 }); - } else { - const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; - const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; - for (uint64_t i0 = 0; i0 < ne0; i0++) { - slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 }); - } - } - } - } - } - } - - ggml_vk_sync_buffers(ctx, subctx); - subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); - return; - } - - if (!sync_staging) { - GGML_ABORT("Asynchronous write to non-pinned memory not supported"); - } - - // Staging buffer required - vk_buffer& staging = ctx->device->sync_staging; - const uint64_t copy_size = ts*ne/bs; - ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); - VkBufferCopy buf_copy{ 0, offset, copy_size }; - - ggml_vk_sync_buffers(ctx, subctx); - vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); - - for (uint64_t i3 = 0; i3 < ne3; i3++) { - for (uint64_t i2 = 0; i2 < ne2; i2++) { - // Find longest contiguous slice - if (ne1*nb1 == dstnb2) { - deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); - } else { - for (uint64_t i1 = 0; i1 < ne1; i1++) { - if (ne0*nb0/bs == dstnb1) { - deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); - } else { - const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; - const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1; - for (uint64_t i0 = 0; i0 < ne0; i0++) { - deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); - } - } - } - } - } - } -} - -static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { - VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); - // Check if src is pinned memory - vk_buffer buf = nullptr; - size_t buf_offset = 0; - ggml_vk_host_get(dst->device, src, buf, buf_offset); - - if (buf != nullptr) { - // Memory is pinned, use as staging buffer - std::vector slices(1); - if (width == spitch && width == dpitch) { - // Only do single write if stride is equal - slices[0].srcOffset = buf_offset; - slices[0].dstOffset = offset; - slices[0].size = width * height; - } else { - slices.resize(height); - for (size_t i = 0; i < height; i++) { - slices[i].srcOffset = buf_offset + i * spitch; - slices[i].dstOffset = offset + i * dpitch; - slices[i].size = width; - } - } - - ggml_vk_sync_buffers(nullptr, subctx); - subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); - return true; - } - VK_LOG_DEBUG("STAGING"); - - if (!sync_staging) { - // copy was not handled caller needs to fall back - return false; - } - - // Staging buffer required - const size_t staging_size = width * height; - ggml_vk_ensure_sync_staging_buffer(dst->device, staging_size); - - vk_buffer& staging_buffer = dst->device->sync_staging; - - std::vector slices(1); - if (width == dpitch) { - slices[0].srcOffset = 0; - slices[0].dstOffset = offset; - slices[0].size = staging_size; - } else { - slices.resize(height); - for (size_t i = 0; i < height; i++) { - slices[i].srcOffset = i * width; - slices[i].dstOffset = offset + i * dpitch; - slices[i].size = width; - } - } - - ggml_vk_sync_buffers(nullptr, subctx); - subctx->s->buffer->buf.copyBuffer((VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, slices); - - if (width == spitch) { - deferred_memcpy((uint8_t *)staging_buffer->ptr, src, staging_size, &subctx->in_memcpys); - } else { - for (size_t i = 0; i < height; i++) { - deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); - } - } - return true; -} - -static bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { - VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); - return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, size, 1, sync_staging); -} - -static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height) { - VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")"); - // Buffer is already mapped - if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); - - if (width == spitch && width == dpitch) { - memcpy((uint8_t *)dst->ptr + offset, src, width * height); - } else { - for (size_t i = 0; i < height; i++) { - memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width); - } - } - } else { - std::lock_guard guard(dst->device->mutex); - - vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(dst->device, subctx); - bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, dpitch, width, height, true); - GGML_ASSERT(ret); - ggml_vk_ctx_end(subctx); - - for (auto& cpy : subctx->in_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - - for (auto& mset : subctx->memsets) { - memset(mset.dst, mset.val, mset.n); - } - - ggml_vk_submit(subctx, dst->device->fence); - VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); - dst->device->device.resetFences({ dst->device->fence }); - ggml_vk_queue_command_pools_cleanup(dst->device); - } -} - -static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")"); - ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1); -} - -static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { - VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); - GGML_ASSERT(width > 0); - GGML_ASSERT(height > 0); - GGML_ASSERT(src != nullptr); - - // TODO: staging_offset is not used - - // Check if dst is pinned memory - vk_buffer buf = nullptr; - size_t buf_offset = 0; - ggml_vk_host_get(src->device, dst, buf, buf_offset); - - std::vector slices(1); - if (width == spitch && width == dpitch) { - // Only do single write if stride is equal - slices[0].srcOffset = offset; - slices[0].dstOffset = buf_offset; - slices[0].size = width * height; - } else { - slices.resize(height); - for (size_t i = 0; i < height; i++) { - slices[i].srcOffset = offset + i * spitch; - slices[i].dstOffset = buf_offset + i * dpitch; - slices[i].size = width; - } - } - - if (buf != nullptr) { - // Memory is pinned, use as staging buffer - ggml_vk_sync_buffers(nullptr, subctx); - subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices); - - return true; - } - VK_LOG_DEBUG("STAGING"); - - if (!sync_staging) { - // copy was not handled caller needs to fall back - return false; - } - - // Fall back to staging buffer - const size_t staging_size = width * height; - ggml_vk_ensure_sync_staging_buffer(src->device, staging_size); - - vk_buffer& staging_buffer = src->device->sync_staging; - - std::vector staging_slices(1); - if (width == spitch) { - staging_slices[0].srcOffset = offset; - staging_slices[0].dstOffset = 0; - staging_slices[0].size = staging_size; - } else { - staging_slices.resize(height); - for (size_t i = 0; i < height; i++) { - staging_slices[i].srcOffset = offset + i * spitch; - staging_slices[i].dstOffset = i * width; - staging_slices[i].size = width; - } - } - - ggml_vk_sync_buffers(nullptr, subctx); - subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, staging_slices); - - if (width == dpitch) { - deferred_memcpy(dst, staging_buffer->ptr, staging_size, &subctx->out_memcpys); - } else { - for (size_t i = 0; i < height; i++) { - deferred_memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) staging_buffer->ptr + i * width, width, &subctx->out_memcpys); - } - } - return true; -} - -static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) { - return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging); -} - -static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height) { - VK_LOG_DEBUG("ggml_vk_buffer_read_2d(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")"); - - // If the device is not an UMA device the memory is host-accessible through rebar. While writing - // through PCIe is sufficient fast reading back data from PCIe is slower than going through - // the HW device to host copy path. - if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) { - GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); - - std::lock_guard guard(src->device->mutex); - vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool); - ggml_vk_ctx_begin(src->device, subctx); - subctx->s->buffer->buf.pipelineBarrier( - vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost, - {}, - { { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead } }, - {}, {}); - ggml_vk_ctx_end(subctx); - ggml_vk_submit(subctx, src->device->fence); - VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), - "vk_buffer_read_2d uma waitForFences"); - src->device->device.resetFences({ src->device->fence }); - ggml_vk_queue_command_pools_cleanup(src->device); - - if (width == spitch && width == dpitch) { - memcpy(dst, (const uint8_t *) src->ptr + offset, width * height); - } else { - for (size_t i = 0; i < height; i++) { - memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width); - } - } - } else { - std::lock_guard guard(src->device->mutex); - - vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(src->device, subctx); - bool ret = ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, spitch, dpitch, width, height, true); - GGML_ASSERT(ret); - ggml_vk_ctx_end(subctx); - - ggml_vk_submit(subctx, src->device->fence); - VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read_2d waitForFences"); - src->device->device.resetFences({ src->device->fence }); - ggml_vk_queue_command_pools_cleanup(src->device); - - for (auto& cpy : subctx->out_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - } -} - -static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")"); - ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1); -} - -static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); - // Make sure both buffers are on same device - GGML_ASSERT(src->device == dst->device); - - VkBufferCopy bc{ src_offset, dst_offset, size }; - - vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc); -} - -static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { - if (src->device == dst->device) { - std::lock_guard guard(src->device->mutex); - VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); - // Copy within the device - vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(src->device, subctx); - ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); - ggml_vk_ctx_end(subctx); - ggml_vk_submit(subctx, src->device->fence); - VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); - src->device->device.resetFences({ src->device->fence }); - ggml_vk_queue_command_pools_cleanup(src->device); - } else { - VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); - // Copy device to device - ggml_vk_ensure_sync_staging_buffer(src->device, size); - - // Copy to src staging buffer - ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size); - // Copy to dst buffer - ggml_vk_buffer_write(dst, dst_offset, src->device->sync_staging->ptr, size); - } -} - -static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")"); - - if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && - dst->device->uma) { - deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets); - return; - } - - // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers - ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c); -} - -static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - - if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && - dst->device->uma) { - memset((uint8_t*)dst->ptr + offset, c, size); - return; - } - - std::lock_guard guard(dst->device->mutex); - vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(dst->device, subctx); - subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c); - ggml_vk_ctx_end(subctx); - - ggml_vk_submit(subctx, dst->device->fence); - VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); - dst->device->device.resetFences({ dst->device->fence }); - ggml_vk_queue_command_pools_cleanup(dst->device); -} - -static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, bool disable_split_k, const vk_pipeline& pipeline) { - VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << disable_split_k << ")"); - - if (disable_split_k) { - return 1; - } - - uint32_t split_k = 1; - if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) { - // If k is 'large' and the SMs will fill less than halfway, use split_k. - uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]); - uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]); - - if (k >= 2048) { - if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) { - split_k = ctx->device->shader_core_count / (m_tiles * n_tiles); - } else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) { - split_k = 3; - } - // Cap the split at 8x. Unless k is huge this is a lot of overhead. - split_k = std::min(split_k, 8u); - - // ggml_vk_matmul will align the splits to be a multiple of 256. - // If this rounded up size would cause the last split to be empty, - // then reduce the split count. - while (true) { - if (split_k == 1) { - break; - } - uint32_t k_split = CEIL_DIV(k, split_k); - k_split = ROUNDUP_POW2(k_split, 256); - if (k_split * (split_k - 1) < k) { - break; - } - split_k--; - } - } - } - - return split_k; -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type, ggml_type src1_type) { - VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); - - // The q8_1 (integer dot) mmq path uses a different shader with its own - // shared-memory layout, so use the int-specific availability flags. - const bool is_q8_1 = (src1_type == GGML_TYPE_Q8_1); - const bool mm_l = is_q8_1 ? ctx->device->mul_mat_l_int[src0_type] : ctx->device->mul_mat_l[src0_type]; - const bool mm_m = is_q8_1 ? ctx->device->mul_mat_m_int[src0_type] : ctx->device->mul_mat_m[src0_type]; - const bool mm_s = is_q8_1 ? ctx->device->mul_mat_s_int[src0_type] : ctx->device->mul_mat_s[src0_type]; - - if (ctx->device->coopmat2) { - const uint32_t shader_core_count = ctx->device->shader_core_count; - const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]); - const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]); - - // Use large shader when the N dimension is greater than the medium shader's tile size - uint32_t crossover_large = mmp->m->wg_denoms[1]; - - // Prefer large over medium if either: - // - medium or large tiles would overfill the GPU - // - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not - // (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead) - bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count || - // split_k==3 with large tiles likely better than medium tiles with no split_k. - (tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2); - - if ((mm_l && (n > crossover_large && prefer_large)) || (!mm_m && !mm_s)) { - return aligned ? mmp->a_l : mmp->l; - } - // Use medium shader when the N dimension is greater than the small shader's tile size - uint32_t crossover_medium = mmp->s->wg_denoms[1]; - if ((mm_m && (n > crossover_medium)) || !mm_s) { - return aligned ? mmp->a_m : mmp->m; - } - return aligned ? mmp->a_s : mmp->s; - } - - if ((mm_s && (m <= 32 || n <= 32)) || (!mm_m && !mm_l)) { - return aligned ? mmp->a_s : mmp->s; - } - if ((mm_m && (m <= 64 || n <= 64)) || !mm_l) { - return aligned ? mmp->a_m : mmp->m; - } - return aligned ? mmp->a_l : mmp->l; -} - -static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) { - VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); - return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type, src1_type)->align; -} - -static void ggml_vk_matmul( - ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, - vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, - uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, - uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, - uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, - uint32_t padded_n) { - VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")"); - if (split_k == 1) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2])); - - uint32_t base_work_group_z = 0; - while (base_work_group_z < batch) { - uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - - const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z }); - base_work_group_z += groups_z; - } - return; - } - - if (ctx->prealloc_split_k_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - - GGML_ASSERT(batch_stride_d == m * n); - - // Round the split size up to a multiple of 256 (k-quant alignment) - uint32_t k_split = CEIL_DIV(k, split_k); - k_split = ROUNDUP_POW2(k_split, 256); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2])); - - uint32_t base_work_group_z = 0; - while (base_work_group_z < batch) { - uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - - const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n }; - // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z }); - base_work_group_z += groups_z; - } - ggml_vk_sync_buffers(ctx, subctx); - const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); - ctx->prealloc_split_k_need_sync = true; -} - -static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type, ggml_type src1_type) { - VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); - - // The q8_1 (integer dot) mmq path uses a different shader with its own - // shared-memory layout, so use the int-specific availability flags. - const bool is_q8_1 = (src1_type == GGML_TYPE_Q8_1); - const bool mm_l = is_q8_1 ? ctx->device->mul_mat_id_l_int[src0_type] : ctx->device->mul_mat_id_l[src0_type]; - const bool mm_m = is_q8_1 ? ctx->device->mul_mat_id_m_int[src0_type] : ctx->device->mul_mat_id_m[src0_type]; - const bool mm_s = is_q8_1 ? ctx->device->mul_mat_id_s_int[src0_type] : ctx->device->mul_mat_id_s[src0_type]; - - if (ctx->device->coopmat2) { - // Use large shader when the N dimension is greater than the medium shader's tile size - uint32_t crossover_large = mmp->m->wg_denoms[1]; - if ((mm_l && (n > crossover_large)) || (!mm_m && !mm_s)) { - return aligned ? mmp->a_l : mmp->l; - } - // Use medium shader when the N dimension is greater than the small shader's tile size - uint32_t crossover_medium = mmp->s->wg_denoms[1]; - if ((mm_m && (n > crossover_medium)) || !mm_s) { - return aligned ? mmp->a_m : mmp->m; - } - return aligned ? mmp->a_s : mmp->s; - } - - if ((mm_s && (m <= 32 || n <= 32)) || (!mm_m && !mm_l)) { - return aligned ? mmp->a_s : mmp->s; - } - if ((mm_m && (m <= 64 || n <= 64)) || !mm_l) { - return aligned ? mmp->a_m : mmp->m; - } - return aligned ? mmp->a_l : mmp->l; -} - -static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) { - VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); - return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true, src0_type, src1_type)->align; -} - -static void ggml_vk_matmul_id( - ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, - vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, const vk_subbuffer & expert_count_buf, - uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, - uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, - uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11, - uint32_t padded_n) { - VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), expert_count: (" << expert_count_buf.buffer->buffer << ", " << expert_count_buf.offset << ", " << expert_count_buf.size << "), " << - "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " << - "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " << - "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"); - const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, - nei0, nei1, nbi1, ne11, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids, expert_count_buf }, pc, { m, nei1, n_as }); -} - -static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { - return - tensor->nb[0] == ggml_type_size(tensor->type) && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && - (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]); -} - -static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { - - // Choose "contiguous copy" shader if src/dst are contiguous - bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst)); - - // Use optimized "transpose" shader if src dim1 is the innermost dimension. - bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src); - - if (transpose && src->type == to) { - if (ggml_type_size(to) == 4) { - return ctx->device->pipeline_cpy_transpose_32; - } else if (ggml_type_size(to) == 2) { - return ctx->device->pipeline_cpy_transpose_16; - } - } - - if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f32_f32; - } else { - return ctx->device->pipeline_cpy_f32_f32; - } - } - if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f32_f16; - } else { - return ctx->device->pipeline_cpy_f32_f16; - } - } - if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f16_f16; - } else { - return ctx->device->pipeline_cpy_f16_f16; - } - } - if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F32) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f16_f32; - } else { - return ctx->device->pipeline_cpy_f16_f32; - } - } - if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_BF16) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f32_bf16; - } else { - return ctx->device->pipeline_cpy_f32_bf16; - } - } - if (src->type == GGML_TYPE_BF16 && to == GGML_TYPE_F32) { - if (contig) { - return ctx->device->pipeline_contig_cpy_bf16_f32; - } else { - return ctx->device->pipeline_cpy_bf16_f32; - } - } - if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_I32) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f32_i32; - } else { - return ctx->device->pipeline_cpy_f32_i32; - } - } - if (src->type == GGML_TYPE_I32 && to == GGML_TYPE_F32) { - if (contig) { - return ctx->device->pipeline_contig_cpy_i32_f32; - } else { - return ctx->device->pipeline_cpy_i32_f32; - } - } - if (src->type == GGML_TYPE_F32) { - switch (to) { - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_IQ4_NL: - return ctx->device->pipeline_cpy_f32_quant[to]; - default: - break; - } - } - - if (to == GGML_TYPE_F32) { - switch (src->type) { - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_IQ4_NL: - return ctx->device->pipeline_cpy_quant_f32[src->type]; - default: - break; - } - } - - if (src->type == to) { - // Copy two or four bytes at a time, depending on block size. - // For quantized types, we scale by block size/type size. But - // this path is also used for bf16->bf16 for example, where the - // type size must be exactly 2 or 4. - GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); - if ((ggml_type_size(src->type) % 4) == 0) { - if (contig) { - return ctx->device->pipeline_contig_cpy_f32_f32; - } else { - return ctx->device->pipeline_cpy_f32_f32; - } - } else { - if (contig) { - return ctx->device->pipeline_contig_cpy_f16_f16; - } else { - return ctx->device->pipeline_cpy_f16_f16; - } - } - } - - std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; - GGML_ABORT("fatal error"); -} - -static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) { - VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; - std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - - const uint32_t ne = ggml_nelements(tensor); - std::array elements; - - if (ne > 262144) { - elements = { 512, 512, CEIL_DIV(ne, 262144) }; - } else if (ne > 512) { - elements = { 512, CEIL_DIV(ne, 512), 1 }; - } else { - elements = { ne, 1, 1 }; - } - - vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); - pc.nb10 = 1; - pc.nb11 = (uint32_t)tensor->ne[0]; - pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]); - pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]); - init_pushconst_fastdiv(pc); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); - ggml_vk_sync_buffers(ctx, subctx); -} - -// Copy/convert tensor into a caller-defined dense layout. Destination strides -// are in output elements, not bytes. -static void ggml_vk_cpy_to_strided( - ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, - const vk_subbuffer & in, const vk_subbuffer & out, - uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) { - VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; - std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - - const uint32_t ne = ggml_nelements(tensor); - std::array elements; - - if (ne > 262144) { - elements = { 512, 512, CEIL_DIV(ne, 262144) }; - } else if (ne > 512) { - elements = { 512, CEIL_DIV(ne, 512), 1 }; - } else { - elements = { ne, 1, 1 }; - } - - vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); - pc.nb10 = nb10; - pc.nb11 = nb11; - pc.nb12 = nb12; - pc.nb13 = nb13; - init_pushconst_fastdiv(pc); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); - ggml_vk_sync_buffers(ctx, subctx); -} - -static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { - switch(type) { - case GGML_TYPE_Q8_1: - return ctx->device->pipeline_quantize_q8_1_x4; - default: - std::cerr << "Missing quantize pipeline for type: " << ggml_type_name(type) << std::endl; - GGML_ABORT("fatal error"); - } -} - -static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, const vk_subbuffer & in, const vk_subbuffer & out, uint32_t ne) { - VK_LOG_DEBUG("ggml_vk_quantize_q8_1(" << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ", " << ne << ")"); - - vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - - const uint32_t num_blocks = CEIL_DIV(ne, pipeline->wg_denoms[0]); - // clamp the number of elements to the max workgroup count. The shader will iterate over the total number of blocks. - const uint64_t max_elements = std::min(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits::max()); - const uint32_t elements = std::min(ne, static_cast(max_elements)); - - const vk_quantize_q8_1_push_constants pc = { - ne, - num_blocks, - }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 }); - ggml_vk_sync_buffers(ctx, subctx); -} - -static vk_pipeline ggml_vk_get_64b_indexing_pipeline(ggml_backend_vk_context * ctx, vk_pipeline &pipeline) { - GGML_UNUSED(ctx); -#if defined(VK_EXT_shader_64bit_indexing) - vk_pipeline *ptr = &pipeline; - while (*ptr) { - if ((*ptr)->is_64b_indexing) { - return *ptr; - } - ptr = &(*ptr)->next; - } -#endif - return pipeline; -} - -static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) { - VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "))"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT - GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - const uint64_t ne03 = src0->ne[3]; - - const uint64_t ne10 = src1->ne[0]; - const uint64_t ne11 = src1->ne[1]; - const uint64_t ne12 = src1->ne[2]; - const uint64_t ne13 = src1->ne[3]; - - const uint64_t ne21 = dst->ne[1]; - const uint32_t stride_d = dst->nb[1] / ggml_type_size(dst->type); - const uint32_t stride_batch_d = stride_d*ne21; - - const uint64_t r2 = ne12 / ne02; - const uint64_t r3 = ne13 / ne03; - - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; - ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; - - vk_buffer d_Qx = nullptr; - size_t qx_buf_offset = 0; - vk_buffer d_Qy = nullptr; - size_t qy_buf_offset = 0; - - bool src0_uma = false; - bool src1_uma = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset); - ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset); - src0_uma = d_Qx != nullptr; - src1_uma = d_Qy != nullptr; - } - - // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf - const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || - !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || - (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || - !ggml_vk_dim01_contiguous(src1); - - // If src0 is BF16, try to use a BF16 x BF16 multiply - ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; - - const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - - bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0; - - // Check for mmq first - vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr; - - if (mmp == nullptr) { - // Fall back to f16 dequant mul mat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); - quantize_y = false; - } - - const bool qx_needs_dequant = mmp == nullptr || x_non_contig; - const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); - - if (qx_needs_dequant) { - // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); - } - - // Not implemented - GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - - const ggml_type effective_src1_type = quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type); - - const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? f16_type : src0->type, effective_src1_type)); - const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && ne11 > 8; - - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, effective_src1_type); - - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { - pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); - } - - // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking - uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11; - const uint64_t x_ne = ggml_nelements(src0); - // 128 elements per Q8_1 x4 block - const uint64_t y_ne = padded_n * ne10 * ne12 * ne13; - const uint64_t d_ne = ggml_nelements(dst); - - const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, disable_split_k, pipeline); - - const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); - const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); - const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); - const uint64_t d_sz = sizeof(float) * d_ne; - - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - vk_pipeline to_q8_1 = nullptr; - - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); - } else { - to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - - if (quantize_y) { - to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - } - - { - const uint64_t split_k_size = split_k > 1 ? d_sz * split_k : 0; - if ( - (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || - (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange) || - (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) { - GGML_ABORT("Requested preallocation size is too large"); - } - if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { - ctx->prealloc_size_y = y_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) { - ctx->prealloc_size_split_k = split_k_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - - // Request descriptor sets - if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); - } - if (quantize_y) { - ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); - } - if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); - } - } - - vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; - GGML_ASSERT(d_D != nullptr); - GGML_ASSERT(d_D->size >= d_buf_offset + d_sz); - vk_buffer d_X; - uint64_t x_buf_offset = 0; - vk_buffer d_Y; - uint64_t y_buf_offset = 0; - if (!src0_uma) { - d_Qx = src0_buf_ctx->dev_buffer; - qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; - GGML_ASSERT(d_Qx != nullptr); - } - if (!src1_uma) { - d_Qy = src1_buf_ctx->dev_buffer; - qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; - GGML_ASSERT(d_Qy != nullptr); - } - if (qx_needs_dequant) { - d_X = ctx->prealloc_x; - GGML_ASSERT(d_X->size >= x_sz); - } else { - d_X = d_Qx; - x_buf_offset = qx_buf_offset; - GGML_ASSERT(qx_sz == x_sz); - } - if (qy_needs_dequant) { - d_Y = ctx->prealloc_y; - GGML_ASSERT(d_Y->size >= y_sz); - } else if (quantize_y) { - d_Y = ctx->prealloc_y; - GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz, 144) * 144); - } else { - d_Y = d_Qy; - y_buf_offset = qy_buf_offset; - GGML_ASSERT(qy_sz == y_sz); - } - - if (x_non_contig || qx_needs_dequant) { - if (ctx->prealloc_x_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - } - - if (x_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); - } else if (qx_needs_dequant) { - const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_X, 0, x_sz } }, pc, { (uint32_t)(x_ne), 1, 1}); - ggml_vk_sync_buffers(ctx, subctx); - } - if (y_non_contig) { - if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); - ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - if (quantize_y) { - if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne); - ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - - uint32_t stride_batch_x = ne00*ne01; - uint32_t stride_batch_y = ne10*ne11; - - if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { - stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); - } - - if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) { - stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); - } - - // compute - ggml_vk_matmul( - ctx, subctx, pipeline, - { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, - ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * split_k }, - ne01, ne11, ne10, - ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d, - split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n - ); // NOLINT - - if (x_non_contig || qx_needs_dequant) { - ctx->prealloc_x_need_sync = true; - } - if (y_non_contig || quantize_y) { - ctx->prealloc_y_need_sync = true; - } -} - -// Device tuning -static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_t n, uint32_t k, ggml_type src0_type) { - if (device->mmvq_mode == 1) { - return true; - } else if (device->mmvq_mode == -1) { - return false; - } - - // q6_k only has 2-byte alignment which makes it somewhat problematic, - // using MMVQ is only a win on Intel. - bool mmvq_q6 = device->vendor_id == VK_VENDOR_ID_INTEL; - if (src0_type == GGML_TYPE_Q6_K && !mmvq_q6) { - return false; - } - - // MMVQ is generally good for batches - if (n > 1) { - return true; - } - - // Quantization overhead is not worth it for small k - switch (device->vendor_id) { - case VK_VENDOR_ID_NVIDIA: - if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { - return true; - } - - if (k <= 4096) { - return false; - } - - switch (src0_type) { - case GGML_TYPE_MXFP4: - case GGML_TYPE_Q8_0: - return device->architecture == vk_device_architecture::NVIDIA_PRE_TURING; - default: - return true; - } - case VK_VENDOR_ID_AMD: - if (k < 2048) { - return false; - } - - switch (src0_type) { - case GGML_TYPE_Q8_0: - return device->architecture == vk_device_architecture::AMD_GCN; - default: - return true; - } - case VK_VENDOR_ID_INTEL: - if (device->architecture == vk_device_architecture::INTEL_XE2) { - if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { - return true; - } - } - - if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { - // Intel Windows proprietary driver MMVQ performance for !Q2/Q3/Q6 is worse than fp16, - // see https://github.com/ggml-org/llama.cpp/issues/17628 and - // https://github.com/ggml-org/llama.cpp/pull/23056 - return false; - } - - if (k < 2048) { - return false; - } - - switch (src0_type) { - // From tests on A770 Linux, may need more tuning - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q5_1: - return false; - default: - return true; - } - default: - return true; - } - - GGML_UNUSED(m); -} - -static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - - VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << ")),)"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT - GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - const uint64_t ne03 = src0->ne[3]; - - const uint64_t ne10 = src1->ne[0]; - const uint64_t ne11 = src1->ne[1]; - const uint64_t ne12 = src1->ne[2]; - const uint64_t ne13 = src1->ne[3]; - - const uint64_t ne20 = dst->ne[0]; - const uint64_t ne21 = dst->ne[1]; - // const uint64_t ne22 = dst->ne[2]; - // const uint64_t ne23 = dst->ne[3]; - - const uint64_t r2 = ne12 / ne02; - const uint64_t r3 = ne13 / ne03; - - // batch_n indicates that we need to compute a few vector results, and this assumes - // ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides. - GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1); - bool batch_n = ne11 > 1; - - const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); - - const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; - bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne11, ne10, src0->type); - - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - - // Check for mmq first - vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, GGML_TYPE_Q8_1, ne11, ne20, ne00) : nullptr; - vk_pipeline to_q8_1 = nullptr; - - if (dmmv == nullptr) { - // Fall back to f16 dequant mul mat - dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11, ne20, ne00); - quantize_y = false; - } - - if (quantize_y) { - to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - } - - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { - dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); - } - - const bool qx_needs_dequant = x_non_contig; - const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); - - // Not implemented - GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - GGML_ASSERT(dmmv != nullptr); - - const uint64_t x_ne = ggml_nelements(src0); - const uint64_t y_ne = ggml_nelements(src1); - - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : - (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); - - { - if ( - (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || - (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { - GGML_ABORT("Requested preallocation size is too large"); - } - if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { - ctx->prealloc_size_y = y_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - - // Request descriptor sets - if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); - } - if (quantize_y) { - ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); - } - } - - vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); - vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1); - vk_subbuffer d_X, d_Y; - - if (qx_needs_dequant) { - d_X = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; - } else { - d_X = d_Qx; - GGML_ASSERT(qx_sz == x_sz); - } - if (qy_needs_dequant || quantize_y) { - d_Y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; - } else { - d_Y = d_Qy; - } - - if (x_non_contig) { - if (ctx->prealloc_x_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); - } - if (y_non_contig) { - GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); - if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y); - ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - if (quantize_y) { - if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne); - ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - - // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride - uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01; - uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11); - uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21); - - if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { - stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); - } - - if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { - stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); - } - - const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; - - uint32_t groups_x = ne01; - uint32_t groups_z = 1; - - if (ne01 > max_groups_x) { - groups_z = 64; - groups_x = CEIL_DIV(groups_x, groups_z); - } - - uint32_t fusion_flags = 0; - - vk_subbuffer d_F0 = d_D; - if (ctx->num_additional_fused_ops > 0) { - const ggml_tensor * add = cgraph->nodes[node_idx + 1]; - const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; - - d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; - } - - vk_subbuffer d_F1 = d_D; - if (ctx->num_additional_fused_ops == 2) { - const ggml_tensor * add = cgraph->nodes[node_idx + 2]; - const ggml_tensor * bias = add->src[0] == cgraph->nodes[node_idx + 1] ? add->src[1] : add->src[0]; - - d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; - } - - ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1])); - - uint32_t base_work_group_y = 0; - while (base_work_group_y < ne12 * ne13) { - - uint32_t groups_y = std::min((uint32_t)(ne12 * ne13) - base_work_group_y, ctx->device->properties.limits.maxComputeWorkGroupCount[1]); - const vk_mat_vec_push_constants pc = { - (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, - stride_batch_x, stride_batch_y, stride_batch_d, - fusion_flags, base_work_group_y, - (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, - }; - ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { - d_X, - d_Y, - d_D, - d_F0, - d_F1, - }, - pc, { groups_x, groups_y, groups_z }); - base_work_group_y += groups_y; - } - - if (x_non_contig) { - ctx->prealloc_x_need_sync = true; - } - if (y_non_contig || quantize_y) { - ctx->prealloc_y_need_sync = true; - } -} - -static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "))"); - GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT - GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - // const uint64_t ne03 = src0->ne[3]; - - //const uint64_t ne10 = src1->ne[0]; - const uint64_t ne11 = src1->ne[1]; - const uint64_t ne12 = src1->ne[2]; - // const uint64_t ne13 = src1->ne[3]; - - GGML_ASSERT(ne11 == 1); - - // With grouped query attention there are > 1 Q matrices per K, V matrix. - uint32_t gqa_ratio = (uint32_t)ne12 / (uint32_t)ne02; - if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) { - gqa_ratio = 1; - } - - vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1]; - - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { - pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); - } - - { - // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - } - - vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true); - vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1, true); - - vk_subbuffer d_F0 = d_D; - - uint32_t fusion_flags = 0; - - if (ctx->num_additional_fused_ops > 0) { - const ggml_tensor * add = cgraph->nodes[node_idx + 1]; - const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; - - d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; - } - - vk_subbuffer d_F1 = d_D; - if (ctx->num_additional_fused_ops > 1) { - const ggml_tensor * bias = cgraph->nodes[node_idx + 2]->src[1]; - - d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; - } - - // compute - - vk_mat_vec_p021_push_constants pc = { - (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, - 0, 0, fusion_flags - }; - - init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); - - uint32_t workgroups_z = (uint32_t)ne12; - // When gqa_ratio > 1, each invocation does multiple rows and we can launch fewer workgroups - if (gqa_ratio > 1) { - workgroups_z /= gqa_ratio; - } - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - d_Qx, - d_Qy, - d_D, - d_F0, - d_F1, - }, pc, { 1, (uint32_t)ne01, workgroups_z }); -} - -static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "))"); - GGML_ASSERT(!ggml_is_transposed(src0)); - GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - const uint64_t ne03 = src0->ne[3]; - - const uint64_t nb01 = src0->nb[1]; - const uint64_t nb02 = src0->nb[2]; - - const uint64_t nb12 = src1->nb[2]; - - // const uint64_t ne10 = src1->ne[0]; - const uint64_t ne11 = src1->ne[1]; - const uint64_t ne12 = src1->ne[2]; - // const uint64_t ne13 = src1->ne[3]; - - const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t)); - const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float)); - const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float)); - - GGML_ASSERT(ne11 == 1); - GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op - - const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t); - const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t); - const uint32_t channel_stride_y = nb12 / sizeof(float); - - vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_nc_f16_f32; - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { - pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); - } - - { - // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - } - - vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true); - vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1, true); - vk_subbuffer d_F0 = d_D; - - uint32_t fusion_flags = 0; - - if (ctx->num_additional_fused_ops > 0) { - const ggml_tensor * add = cgraph->nodes[node_idx + 1]; - const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; - - d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; - } - - vk_subbuffer d_F1 = d_D; - if (ctx->num_additional_fused_ops > 1) { - const ggml_tensor * bias = cgraph->nodes[node_idx + 2]->src[1]; - - d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; - } - - // compute - vk_mat_vec_nc_push_constants pc = { - (uint32_t)ne00, (uint32_t)ne01, - row_stride_x, channel_stride_x, channel_stride_y, - (uint32_t)(ne12 / ne02), (uint32_t)ne12, - 0, 0, - nb03, nb13, nb23, fusion_flags - }; - - init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - d_Qx, - d_Qy, - d_D, - d_F0, - d_F1, - }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); -} - -static int ggml_vk_fwht_pipeline_idx(int64_t n) { - switch (n) { - case 64: return 0; - case 128: return 1; - case 256: return 2; - case 512: return 3; - default: return -1; - } -} - -static bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst) { - if (ctx->num_additional_fused_ops != 0) { - return false; - } - - if (ggml_get_op_params_i32(dst, 1) != GGML_HINT_SRC0_IS_HADAMARD) { - return false; - } - - const int idx = ggml_vk_fwht_pipeline_idx(src1->ne[0]); - if (idx < 0 || ctx->device->pipeline_fwht_f32[idx] == nullptr) { - return false; - } - - if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { - return false; - } - - if (!ggml_is_contiguous(src1)) { - return false; - } - GGML_ASSERT(ggml_is_contiguous(dst)); - - return true; -} - -static void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst) { - const int idx = ggml_vk_fwht_pipeline_idx(src->ne[0]); - vk_pipeline pipeline = ctx->device->pipeline_fwht_f32[idx]; - - const uint32_t rows_per_workgroup = 4; - const uint32_t n_rows = (uint32_t)ggml_nrows(src); - const uint32_t max_workgroups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; - - const uint32_t total_workgroups = CEIL_DIV(n_rows, rows_per_workgroup); - const uint32_t workgroups_x = std::min(total_workgroups, max_workgroups_x); - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - const vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src, true); - const vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true); - - vk_op_fwht_push_constants pc = { - n_rows, - 0, - 0, - 1.0f / std::sqrt((float)src->ne[0]), - }; - init_pushconst_tensor_offsets(ctx, pc, src, nullptr, nullptr, nullptr, dst); - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, { workgroups_x, 1, 1 }); -} - -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - ggml_tensor * src0 = dst->src[0]; - ggml_tensor * src1 = dst->src[1]; - VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); - - // Handle huge A matrix by splitting the M dimensions. This works well for convolution use cases - // where the M dimension is very large. - // Split_k doesn't work with M splitting. - // This only supports batchsize == 1. - const size_t nbytes = ggml_nbytes(src0); - const bool needs_split = dst->ne[2] == 1 && dst->ne[3] == 1 && nbytes > ctx->device->properties.limits.maxStorageBufferRange; - if (needs_split) { - // Choose the number of rows that can fit (and divide by two, to allow for any additional offsets) - const uint32_t M_split = ctx->device->properties.limits.maxStorageBufferRange / (2 * src0->nb[1]); - uint32_t m_offset = 0; - while (m_offset < dst->ne[0]) { - const uint32_t cur_M_size = std::min(M_split, (uint32_t)(dst->ne[0] - m_offset)); - ggml_tensor dst2 = *dst; - ggml_tensor src02 = *src0; - - dst2.view_src = dst->view_src ? dst->view_src : dst; - src02.view_src = src0->view_src ? src0->view_src : src0; - - dst2.view_offs += m_offset * dst->nb[0]; - src02.view_offs += m_offset * src0->nb[1]; - dst2.ne[0] = cur_M_size; - src02.ne[1] = cur_M_size; - - ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true); - - m_offset += cur_M_size; - } - } else if (ggml_vk_can_use_fwht(ctx, src1, dst)) { - ggml_vk_fwht(ctx, subctx, src1, dst); - } else if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 && - // detect 0213 permutation, and batch size of 1 - src0->nb[0] <= src0->nb[2] && - src0->nb[2] <= src0->nb[1] && - src0->nb[1] <= src0->nb[3] && - src1->nb[0] <= src1->nb[2] && - src1->nb[2] <= src1->nb[1] && - src1->nb[1] <= src1->nb[3] && - src0->ne[3] == 1 && - src1->ne[3] == 1 && - src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && - src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) { - ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx); - } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && - !ggml_is_permuted(src0) && !ggml_is_permuted(src1) && - src0->ne[3] <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] && - src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && - src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) { - ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx); - // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four) - // when ne12 and ne13 are one. - } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) && - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx); - } else { - ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false); - } -} - -static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT - GGML_ASSERT(ids->type == GGML_TYPE_I32); - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - // const uint64_t ne03 = src0->ne[3]; - - const uint64_t ne10 = src1->ne[0]; - const uint64_t ne11 = src1->ne[1]; - const uint64_t ne12 = src1->ne[2]; - const uint64_t ne13 = src1->ne[3]; - - const uint64_t nei0 = ids->ne[0]; - const uint64_t nei1 = ids->ne[1]; - - const uint32_t nbi0 = ids->nb[0]; - const uint32_t nbi1 = ids->nb[1]; - const uint32_t nbi2 = ids->nb[2]; - - const uint64_t ne20 = dst->ne[0]; - const uint64_t ne21 = dst->ne[1]; - // const uint64_t ne22 = dst->ne[2]; - // const uint64_t ne23 = dst->ne[3]; - - const uint64_t n_as = ne02; - - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; - ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; - ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; - - vk_buffer d_Qx = nullptr; - size_t qx_buf_offset = 0; - vk_buffer d_Qy = nullptr; - size_t qy_buf_offset = 0; - vk_buffer d_ids = nullptr; - size_t ids_buf_offset = 0; - - bool src0_uma = false; - bool src1_uma = false; - bool ids_uma = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset); - ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset); - ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset); - src0_uma = d_Qx != nullptr; - src1_uma = d_Qy != nullptr; - ids_uma = d_ids != nullptr; - } - - // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf - const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || - !ggml_vk_dim01_contiguous(src0); - // If src0 is BF16, try to use a BF16 x BF16 multiply - ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; -#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) - // B must already be, or be convertible to, the matmul B type used by this path. - const bool y_decode_vector_supported = ctx->device->coopmat2_decode_vector && - (f16_type != GGML_TYPE_BF16 || ctx->device->coopmat2_bf16_support) && - (src1->type == GGML_TYPE_F32 || src1->type == f16_type); - // If B is copied to prealloc_y, we can choose a 4-element-aligned row stride. - const bool y_decode_vector_uses_prealloc = !ggml_vk_dim01_contiguous(src1) || src1->type != f16_type; - // Direct B reads are safe only if row starts and the original buffer offset are 4-element aligned. - const bool y_decode_vector_aligned = - (ne10 % 4 == 0) && - (y_decode_vector_uses_prealloc || get_misalign_bytes(ctx, src1) % (4 * ggml_type_size(src1->type)) == 0); - // Stage B only when decode-vector is available and direct B reads would be misaligned. - const bool y_decode_vector_staging = y_decode_vector_supported && !y_decode_vector_aligned; -#else - const bool y_decode_vector_staging = false; -#endif - const bool y_non_contig = y_decode_vector_staging || - (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || - (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || - !ggml_vk_dim01_contiguous(src1); - - const uint32_t y_staged_row_stride = y_decode_vector_staging ? (uint32_t)ggml_vk_align_size(ne10, 4) : (uint32_t)ne10; - - const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - - bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0; - - // Check for mmq first - vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr; - - if (mmp == nullptr) { - // Fall back to f16 dequant mul mat - mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); - quantize_y = false; - } - - const bool qx_needs_dequant = mmp == nullptr || x_non_contig; - const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); - - if (qx_needs_dequant) { - // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); - } - - // Not implemented - GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - - const ggml_type effective_src1_type = quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type); - - const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? f16_type : src0->type, effective_src1_type)); - const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && nei1 > 8; - - vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type, effective_src1_type); - - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { - pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); - } - // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking - uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11; - const uint64_t x_ne = ggml_nelements(src0); - const uint64_t y_ne = (uint64_t)y_staged_row_stride * padded_n * ne12 * ne13; - const uint64_t d_ne = ggml_nelements(dst); - - const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); - const uint64_t qy_sz = ggml_type_size(src1->type) * ggml_nelements(src1) / ggml_blck_size(src1->type); - const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); - const uint64_t ids_sz = nbi2; - const uint64_t d_sz = sizeof(float) * d_ne; - - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - vk_pipeline to_q8_1 = nullptr; - - auto make_y_staged_dst = [&]() { - ggml_tensor y_staged_dst = *src1; - y_staged_dst.type = f16_type; - y_staged_dst.nb[0] = ggml_type_size(f16_type); - y_staged_dst.nb[1] = y_staged_dst.nb[0] * y_staged_row_stride; - y_staged_dst.nb[2] = y_staged_dst.nb[1] * padded_n; - y_staged_dst.nb[3] = y_staged_dst.nb[2] * y_staged_dst.ne[2]; - return y_staged_dst; - }; - - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); - } else { - to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); - } - if (y_non_contig) { - ggml_tensor y_staged_dst; - const ggml_tensor * y_staged_dst_ptr = nullptr; - if (y_decode_vector_staging) { - y_staged_dst = make_y_staged_dst(); - y_staged_dst_ptr = &y_staged_dst; - } - - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, y_staged_dst_ptr, f16_type); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - - if (quantize_y) { - to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - } - vk_pipeline count_experts = ctx->device->pipeline_count_experts; - - uint32_t expert_count_size = sizeof(uint32_t) * n_as; - - { - if ( - (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || - (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { - GGML_ABORT("Requested preallocation size is too large"); - } - if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { - ctx->prealloc_size_y = y_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if (ctx->prealloc_size_split_k < expert_count_size) { - ctx->prealloc_size_split_k = expert_count_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - - // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); - } - if (quantize_y) { - ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); - } - ggml_pipeline_request_descriptor_sets(ctx, count_experts, 1); - } - - vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; - GGML_ASSERT(d_D != nullptr); - vk_buffer d_X; - uint64_t x_buf_offset = 0; - vk_buffer d_Y; - uint64_t y_buf_offset = 0; - if (!src0_uma) { - d_Qx = src0_buf_ctx->dev_buffer; - qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; - GGML_ASSERT(d_Qx != nullptr); - } - if (!src1_uma) { - d_Qy = src1_buf_ctx->dev_buffer; - qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; - GGML_ASSERT(d_Qy != nullptr); - } - if (!ids_uma) { - d_ids = ids_buf_ctx->dev_buffer; - ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; - GGML_ASSERT(d_ids != nullptr); - } - if (qx_needs_dequant) { - d_X = ctx->prealloc_x; - GGML_ASSERT(d_X->size >= x_sz); - } else { - d_X = d_Qx; - x_buf_offset = qx_buf_offset; - GGML_ASSERT(qx_sz == x_sz); - } - if (qy_needs_dequant) { - d_Y = ctx->prealloc_y; - GGML_ASSERT(d_Y->size >= y_sz); - } else if (quantize_y) { - d_Y = ctx->prealloc_y; - GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz, 144) * 144); - } else { - d_Y = d_Qy; - y_buf_offset = qy_buf_offset; - GGML_ASSERT(qy_sz == y_sz); - } - - if (x_non_contig || qx_needs_dequant) { - if (ctx->prealloc_x_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - } - // Count how many times each expert is used - vk_subbuffer expert_count_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); - if (ctx->prealloc_split_k_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - { - const std::vector pc = { (uint32_t)nei0, - (uint32_t)nei1, - (uint32_t)(nbi0 / ggml_type_size(ids->type)), - (uint32_t)(nbi1 / ggml_type_size(ids->type)), - (uint32_t)(get_misalign_bytes(ctx, ids) / ggml_type_size(ids->type)) }; - ggml_vk_dispatch_pipeline(ctx, subctx, count_experts, - { vk_subbuffer{ d_ids, ids_buf_offset, ids_sz }, expert_count_buf }, pc, { (uint32_t)n_as, 1, 1}); - } - - if (x_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); - } else if (qx_needs_dequant) { - const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_X, 0, x_sz } }, pc, { (uint32_t)x_ne, 1, 1}); - } - if (y_non_contig) { - if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging != y_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - if (y_decode_vector_staging) { - const ggml_tensor y_staged_dst = make_y_staged_dst(); - const uint32_t y_staged_dst_type_size = ggml_type_size(y_staged_dst.type); - ggml_vk_cpy_to_strided( - ctx, subctx, to_fp16_vk_1, src1, - ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), - (uint32_t)(y_staged_dst.nb[0] / y_staged_dst_type_size), - (uint32_t)(y_staged_dst.nb[1] / y_staged_dst_type_size), - (uint32_t)(y_staged_dst.nb[2] / y_staged_dst_type_size), - (uint32_t)(y_staged_dst.nb[3] / y_staged_dst_type_size)); - } else { - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); - } - ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = y_decode_vector_staging; - } - } - if (quantize_y) { - if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne); - ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - ggml_vk_sync_buffers(ctx, subctx); - - uint32_t stride_batch_x = ne00*ne01; - uint32_t stride_b_y = y_decode_vector_staging ? y_staged_row_stride : ne10; - uint32_t stride_batch_y = y_decode_vector_staging ? y_staged_row_stride * padded_n : ne10*ne11; - - if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { - stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); - } - - if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) { - stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); - } - - // compute - ggml_vk_matmul_id( - ctx, subctx, pipeline, - { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, - { d_D, d_buf_offset, d_sz }, { d_ids, ids_buf_offset, ids_sz }, expert_count_buf, - ne01, ne21, ne10, ne10, stride_b_y, ne01, - stride_batch_x, stride_batch_y, ne20*ne21, - n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n - ); // NOLINT - - if (x_non_contig || qx_needs_dequant) { - ctx->prealloc_x_need_sync = true; - } - if (y_non_contig || quantize_y) { - ctx->prealloc_y_need_sync = true; - } - ctx->prealloc_split_k_need_sync = true; -} - -static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - ggml_tensor * src0 = dst->src[0]; - ggml_tensor * src1 = dst->src[1]; - ggml_tensor * ids = dst->src[2]; - VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "))"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT - GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT - GGML_ASSERT(ids->type == GGML_TYPE_I32); - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - // const uint64_t ne02 = src0->ne[2]; - // const uint64_t ne03 = src0->ne[3]; - - const uint64_t ne10 = src1->ne[0]; - const uint64_t ne11 = src1->ne[1]; - const uint64_t ne12 = src1->ne[2]; - // const uint64_t ne13 = src1->ne[3]; - - const uint64_t nei0 = ids->ne[0]; - const uint64_t nei1 = ids->ne[1]; - const uint32_t nbi1 = (uint32_t)(ids->nb[1] / sizeof(int)); - - const uint64_t ne20 = dst->ne[0]; - const uint64_t ne21 = dst->ne[1]; - // const uint64_t ne22 = dst->ne[2]; - // const uint64_t ne23 = dst->ne[3]; - - const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); - - const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; - bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne12, ne10, src0->type); - - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - - // Check for mmq first - vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, GGML_TYPE_Q8_1, ne20, ne00) : nullptr; - vk_pipeline to_q8_1 = nullptr; - - if (dmmv == nullptr) { - // Fall back to f16 dequant mul mat - dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type, ne20, ne00); - quantize_y = false; - } - - if (quantize_y) { - to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - } - - const bool qx_needs_dequant = x_non_contig; - const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); - - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { - dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); - } - - // Not implemented - GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - GGML_ASSERT(dmmv != nullptr); - - const uint64_t x_ne = ggml_nelements(src0); - const uint64_t y_ne = ggml_nelements(src1); - - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : - (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); - - { - if ( - (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || - (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { - GGML_ABORT("Requested preallocation size is too large"); - } - if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { - ctx->prealloc_size_y = y_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - - // Request descriptor sets - if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); - } - if (quantize_y) { - ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); - } - ggml_pipeline_request_descriptor_sets(ctx, dmmv, nei1); - } - - vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); - vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1); - vk_subbuffer d_ids = ggml_vk_tensor_subbuffer(ctx, ids); - vk_subbuffer d_F0 = d_D; - vk_subbuffer d_X, d_Y; - - if (qx_needs_dequant) { - d_X = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; - } else { - d_X = d_Qx; - } - if (qy_needs_dequant || quantize_y) { - d_Y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; - } else { - d_Y = d_Qy; - } - - if (x_non_contig) { - if (ctx->prealloc_x_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - } - - if (x_non_contig) { - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); - } - if (y_non_contig) { - GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); - if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y); - ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - if (quantize_y) { - if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || - ctx->prealloc_y_last_tensor_used != src1 || - ctx->prealloc_y_last_decode_vector_staging) { - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne); - ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); - ctx->prealloc_y_last_tensor_used = src1; - ctx->prealloc_y_last_decode_vector_staging = false; - } - } - - uint32_t stride_batch_y = ne10*ne11; - - if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { - stride_batch_y = src1->nb[2] / ggml_type_size(src1->type); - } - - const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; - - uint32_t groups_x = ne01; - uint32_t groups_z = 1; - - if (ne01 > max_groups_x) { - groups_z = 64; - groups_x = CEIL_DIV(groups_x, groups_z); - } - - uint32_t fusion_flags = 0; - - if (ctx->num_additional_fused_ops > 0) { - const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1]; - - d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); - - if (cgraph->nodes[node_idx + 1]->op == GGML_OP_MUL) { - fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE0; - } else { - GGML_ASSERT(cgraph->nodes[node_idx + 1]->op == GGML_OP_ADD_ID); - fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; - } - } - - vk_subbuffer d_F1 = d_D; - if (ctx->num_additional_fused_ops > 1) { - const ggml_tensor * scale = cgraph->nodes[node_idx + 2]->src[1]; - - d_F1 = ggml_vk_tensor_subbuffer(ctx, scale); - fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE1; - } - - // Loop over the batch dimension - for (uint32_t expert_i1 = 0; expert_i1 < nei1; ++expert_i1) { - const vk_mat_vec_id_push_constants pc = { - (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, - (uint32_t)(ne00 * ne01), stride_batch_y, (uint32_t)(ne20 * ne21), - fusion_flags, - (uint32_t)nei0, (uint32_t)ne11, expert_i1, nbi1 - }; - ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { - d_X, - d_Y, - d_D, - d_F0, - d_F1, - d_ids, - }, - pc, { groups_x, (uint32_t)nei0, groups_z }); - } - - if (x_non_contig) { - ctx->prealloc_x_need_sync = true; - } - if (y_non_contig || quantize_y) { - ctx->prealloc_y_need_sync = true; - } -} - -static bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - ggml_tensor * src0 = dst->src[0]; - ggml_tensor * src2 = dst->src[2]; - return (src2->ne[1] <= 8) && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)); -} - -static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - ggml_tensor * src0 = dst->src[0]; - ggml_tensor * src1 = dst->src[1]; - ggml_tensor * src2 = dst->src[2]; - VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); - if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { - ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx); - } else { - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst); - } -} - -static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type) { - GGML_UNUSED(f32acc); - GGML_UNUSED(v_type); - // Needs to be kept up to date on shader changes - const uint32_t wg_size = params.workgroup_size; - const uint32_t Br = params.block_rows; - const uint32_t Bc = params.block_cols; - - // BF16 uses the fp32 shader (FLOAT_TYPE=float) - const uint32_t float_type_size = (device->fp16 && k_type != GGML_TYPE_BF16) ? sizeof(ggml_fp16_t) : sizeof(float); - - const bool mmq = ggml_vk_fa_scalar_uses_mmq(device, k_type); - - // tmpsh is overestimated slightly - const uint32_t tmpsh = wg_size * sizeof(float); - const uint32_t tmpshv4 = wg_size * 4 * float_type_size; - - const uint32_t masksh = Bc * (Br + 1) * float_type_size; - - uint32_t Qf, kvsh, kblocksh_size; - if (mmq) { - // block_b_cache: int32_t qs[8] + FLOAT_TYPEV2 ds - const uint32_t block_b_size = 8 * sizeof(int32_t) + 2 * float_type_size; - Qf = Br * (hsk / 32) * block_b_size; - - // kvsh uses D = HSV (K goes through kblocksh instead) - kvsh = params.shmem_staging ? Bc * (hsv / 4 + 1) * 4 * float_type_size : 4 * float_type_size; - - // The mixed MMQ shader uses a superset block_a_cache that fits every - // FA-supported quant: int32_t qs[8] + uint32_t qh + FLOAT_TYPEV2 dm. - // Single-scale types leave dm.y unused; non-Q5_* leave qh unused. - const uint32_t block_a_size = 8 * sizeof(int32_t) + sizeof(uint32_t) + 2 * float_type_size; - kblocksh_size = params.shmem_staging ? Bc * (hsk / 32) * block_a_size : block_a_size; - } else { - Qf = Br * (hsk / 4 + 1) * 4 * float_type_size; - - const uint32_t D = std::max(hsk, hsv); - kvsh = params.shmem_staging ? Bc * (D / 4 + 1) * 4 * float_type_size : 4 * float_type_size; - - kblocksh_size = 0; - } - - const uint32_t total_size = tmpsh + tmpshv4 + masksh + Qf + kvsh + kblocksh_size; - const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; - - VK_LOG_DEBUG("ggml_vk_flash_attn_scalar_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", mmq=" << mmq << ", total_size=" << total_size << ", supported=" << supported); - - return supported; -} - -static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type) { - // Needs to be kept up to date on shader changes - const uint32_t Br = params.block_rows; - const uint32_t Bc = params.block_cols; - - const uint32_t MatBr = 16, MatBc = 16; - - const uint32_t row_split = Bc / MatBc; - - const uint32_t hsk_pad = ROUNDUP_POW2(hsk, 16); - const uint32_t hsv_pad = ROUNDUP_POW2(hsv, 16); - - const uint32_t acctype = f32acc ? 4 : 2; - const uint32_t f16vec4 = 8; - - const uint32_t tmpsh = (Bc / MatBc) * sizeof(float); - - const uint32_t qstride = hsk_pad / 4 + 2; - const uint32_t Qf = Br * qstride * f16vec4; - - const uint32_t psh_stride = Br / 4 + 2; - const uint32_t Psh = Bc * psh_stride * f16vec4; - - const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br; - const uint32_t sfsh = Bc * sfshstride * acctype; - - const uint32_t kvshstride = (params.shmem_staging ? std::max(hsk_pad, hsv_pad) : MatBr) / 4 + 2; - const uint32_t vsh_stride = MatBc / 4 * row_split; - const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4; - - // BF16 PVMat accumulator is f32 (no bf16 accumulator support), so pvsh is vec4 (16 bytes) - const uint32_t pvsh_elem_size = (k_type == GGML_TYPE_BF16) ? 16u : f16vec4; - const uint32_t osh_stride = params.row_split * MatBr / 4; - const uint32_t pvsh = MatBc * osh_stride * pvsh_elem_size; - - const uint32_t slope = Br * acctype; - - const uint32_t total_size = tmpsh + Qf + Psh + sfsh + ksh + pvsh + slope; - const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; - - VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported); - - return supported; -} - -static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; - std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; - std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - if (sinks) { - std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3]; - } - std::cerr << "))"); - - GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const uint32_t nem0 = mask ? mask->ne[0] : 0; - const uint32_t nem1 = mask ? mask->ne[1] : 0; - const uint32_t nem2 = mask ? mask->ne[2] : 0; - const uint32_t nem3 = mask ? mask->ne[3] : 0; - - const uint32_t HSK = nek0; - const uint32_t HSV = nev0; - uint32_t N = neq1; - const uint32_t KV = nek1; - - GGML_ASSERT(ne0 == HSV); - GGML_ASSERT(ne2 == N); - - // input tensor rows must be contiguous - GGML_ASSERT(nbq0 == ggml_type_size(q->type)); - GGML_ASSERT(nbk0 == ggml_type_size(k->type)); - GGML_ASSERT(nbv0 == ggml_type_size(v->type)); - - GGML_ASSERT(neq0 == HSK); - - GGML_ASSERT(neq1 == N); - - GGML_ASSERT(nev1 == nek1); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - assert(dst->type == GGML_TYPE_F32); - assert(q->type == GGML_TYPE_F32); - uint32_t gqa_ratio = 1; - uint32_t qk_ratio = neq2 / nek2; - uint32_t workgroups_x = (uint32_t)neq1; - uint32_t workgroups_y = (uint32_t)neq2; - uint32_t workgroups_z = (uint32_t)neq3; - - const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16; - - // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga. - // For coopmat2 FA, we always use the small size (which is still pretty large for gqa). - vk_fa_tuning_params tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, 512, KV, k->type, v->type, f32acc); - const uint32_t max_gqa = std::min(tuning_params.block_rows, 32u); - - if (N <= 8 && qk_ratio > 1 && qk_ratio <= max_gqa && - qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) { - // grouped query attention - make the N dimension equal to gqa_ratio, reduce - // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 - // and change addressing calculations to index Q's dimension 2. - gqa_ratio = qk_ratio; - N = gqa_ratio; - workgroups_y /= gqa_ratio; - } - - tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, N, KV, k->type, v->type, f32acc); - - const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type)); - uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type)); - uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type)); - - // For F32, the shader treats it as a block of size 4 (for vec4 loads) - if (k->type == GGML_TYPE_F32) { - k_stride /= 4; - } - if (v->type == GGML_TYPE_F32) { - v_stride /= 4; - } - - const uint32_t alignment = tuning_params.block_cols; - bool aligned = (KV % alignment) == 0 && - // the "aligned" shader variant will forcibly align strides, for performance - (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0; - - // Need to use the coopmat2 variant that clamps loads when HSK/HSV aren't sufficiently aligned. - if (((HSK | HSV) % 16) != 0 && tuning_params.path == FA_COOPMAT2) { - aligned = false; - } - - float scale = 1.0f; - float max_bias = 0.0f; - float logit_softcap = 0.0f; - - memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); - memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); - - if (logit_softcap != 0) { - scale /= logit_softcap; - } - - // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. - bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16; - vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc, - mask != nullptr, use_mask_opt, logit_softcap != 0, k->type, v->type); - - vk_pipeline pipeline = nullptr; - - { - std::lock_guard guard(ctx->device->compile_mutex); - auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16; - auto it = pipelines.find(fa_pipeline_state); - if (it != pipelines.end()) { - pipeline = it->second; - } else { - pipelines[fa_pipeline_state] = pipeline = std::make_shared(); - } - } - - assert(pipeline); - // Compile early to initialize wg_denoms. - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - uint32_t split_kv = KV; - uint32_t split_k = 1; - - // Intel Alchemist prefers more workgroups - const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2) ? 2 : 1; - - // Use a placeholder core count if one isn't available. split_k is a big help for perf. - const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count * shader_core_count_multiplier : 16; - - const uint32_t Br = fa_pipeline_state.Br; - const uint32_t Bc = fa_pipeline_state.Bc; - - GGML_ASSERT(Br == pipeline->wg_denoms[0]); - const uint32_t Tr = CEIL_DIV(N, Br); - - // Try to use split_k when KV is large enough to be worth the overhead. - if (gqa_ratio > 1 && workgroups_x <= Br) { - split_k = shader_core_count * 2 / (workgroups_x * workgroups_y * workgroups_z); - } else if (gqa_ratio <= 1) { - uint32_t total_wgs_no_split = Tr * workgroups_y * workgroups_z; - if (total_wgs_no_split < shader_core_count * 2) { - split_k = shader_core_count * 2 / total_wgs_no_split; - } - } - - if (split_k > 1) { - // Try to evenly split KV into split_k chunks, but it needs to be a multiple - // of "align", so recompute split_k based on that. - split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment); - split_k = CEIL_DIV(KV, split_kv); - } - - // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1) - // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows. - // For matrices, the order is (inner to outer) [HSV, ne1, k, ne2, ne3]. - // For L/M, the order is (inner to outer) [ne1, k, ne2, ne3]. - const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne2 * ne3 : 0; - if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) { - GGML_ABORT("Requested preallocation size is too large"); - } - if (ctx->prealloc_size_split_k < split_k_size) { - ctx->prealloc_size_split_k = split_k_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - - const uint32_t mask_opt_num_dwords = CEIL_DIV(nem0, 16 * Bc); - const uint64_t mask_opt_size = sizeof(uint32_t) * mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2 * nem3; - - vk_pipeline pipeline_fa_mask_opt = nullptr; - if (use_mask_opt) { - { - std::lock_guard guard(ctx->device->compile_mutex); - auto &pipelines = ctx->device->pipeline_fa_mask_opt; - auto it = pipelines.find({Br, Bc}); - if (it != pipelines.end()) { - pipeline_fa_mask_opt = it->second; - } else { - pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared(); - } - } - assert(pipeline_fa_mask_opt); - ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1); - - if (ctx->prealloc_size_y < mask_opt_size) { - ctx->prealloc_size_y = mask_opt_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if (ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - } - - const uint32_t n_head_kv = neq2; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q); - vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k); - vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v); - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf; - vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf; - vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf; - - uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2; - - if (use_mask_opt) - { - const vk_op_flash_attn_mask_opt_push_constants opt_pc = { - nem0, - nem1, - nem2, - (uint32_t)(mask->nb[1] / sizeof(ggml_fp16_t)), - (uint32_t)(mask->nb[2] / sizeof(ggml_fp16_t)), - (uint32_t)(mask->nb[3] / sizeof(ggml_fp16_t)), - mask_opt_num_dwords, - mask_opt_num_dwords * CEIL_DIV(nem1, Br), - mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2, - }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline_fa_mask_opt, - { mask_buf, mask_opt_buf }, opt_pc, - { mask_opt_num_dwords, CEIL_DIV(nem1, Br), nem2 * nem3 }); - ggml_vk_sync_buffers(ctx, subctx); - } - - const vk_flash_attn_push_constants pc = { N, KV, - (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, - (uint32_t)neq2, (uint32_t)neq3, - (uint32_t)nek2, (uint32_t)nek3, - (uint32_t)nev2, (uint32_t)nev3, - nem1, nem2, nem3, - q_stride, (uint32_t)nbq2, (uint32_t)nbq3, - k_stride, (uint32_t)nbk2, (uint32_t)nbk3, - v_stride, (uint32_t)nbv2, (uint32_t)nbv3, - scale, max_bias, logit_softcap, - mask_n_head_log2, m0, m1, - gqa_ratio, split_kv, split_k }; - - if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); - - if (ctx->prealloc_split_k_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - - // We reuse workgroups_x to mean the number of splits, so we need to - // cancel out the divide by wg_denoms[0]. - uint32_t dispatch_x; - if (gqa_ratio > 1) { - workgroups_x *= pipeline->wg_denoms[0]; - dispatch_x = split_k * workgroups_x; - } else { - dispatch_x = Tr * split_k * pipeline->wg_denoms[0]; - } - - vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf, mask_opt_buf}, - pc, { dispatch_x, workgroups_y, workgroups_z }); - - ggml_vk_sync_buffers(ctx, subctx); - const vk_op_flash_attn_split_k_reduce_push_constants pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, split_k, (sinks != nullptr) }; - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, - {split_k_buf, sinks_buf, dst_buf}, - pc2, { (uint32_t)ne1, HSV, (uint32_t)(ne2 * ne3) }); - ctx->prealloc_split_k_need_sync = true; - } else { - if (gqa_ratio > 1) { - // When using gqa, we want one actual workgroup per batch, so cancel out wg_denoms - workgroups_x *= pipeline->wg_denoms[0]; - } - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf, mask_opt_buf}, - pc, { workgroups_x, workgroups_y, workgroups_z }); - } -} - -static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, uint32_t K, uint32_t NPQ) { - auto n_tiles = [&](vk_conv_shapes s) { - return CEIL_DIV(K, vk_conv_block_sizes[s].K) - * CEIL_DIV(NPQ, vk_conv_block_sizes[s].NPQ); - }; - - // We can't query number of shader cores on Intel, use 32 as a placeholder - // so small convolutions will still choose a smaller tile. - const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32; - - // 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile. - bool allow_128x128 = true; -#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) { - allow_128x128 = false; - } -#endif - - if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) { - return CONV_SHAPE_128x128; - } else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) { - return CONV_SHAPE_32x256; - } else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) { - return CONV_SHAPE_64x128; - } else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) { - // cm1 fallback for large K when 128x128 isn't available - return CONV_SHAPE_64x128; - } else { - return CONV_SHAPE_64x32; - } -} - -static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) { - switch (op) { - case GGML_OP_GET_ROWS: - GGML_ASSERT(src1->type == GGML_TYPE_I32); - if (src0->type == GGML_TYPE_I32) { - // i32 src only supports i32 result - GGML_ASSERT(dst->type == GGML_TYPE_I32); - return ctx->device->pipeline_get_rows[src0->type]; - } - if (dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_get_rows[src0->type]; - } - if (dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_get_rows_f32[src0->type]; - } - return nullptr; - case GGML_OP_ACC: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_acc_f32; - } - return nullptr; - case GGML_OP_SET: - if (src0->type == src1->type && src0->type == dst->type && - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32)) { - return ctx->device->pipeline_set_f32; - } - return nullptr; - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || - (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) || - (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16)) { - return nullptr; - } - switch (op) { - case GGML_OP_ADD: - { - if (ctx->num_additional_fused_ops > 0) { - if (ctx->do_add_rms_partials) { - return ctx->device->pipeline_multi_add_rms[ctx->num_additional_fused_ops]; - } else { - return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; - } - } - if (ctx->do_add_rms_partials) { - auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_rms_norepeat : ctx->device->pipeline_add_rms; - return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; - } else { - auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add; - return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; - } - } - case GGML_OP_SUB: - { - auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_sub_norepeat : ctx->device->pipeline_sub; - return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; - } - case GGML_OP_MUL: - { - auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_norepeat : ctx->device->pipeline_mul; - return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; - } - case GGML_OP_DIV: - { - auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_norepeat : ctx->device->pipeline_div; - return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; - } - default: - break; - } - return nullptr; - case GGML_OP_ADD_ID: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && src2->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_add_id_f32; - } - return nullptr; - case GGML_OP_CONCAT: { - if (src0->type != src1->type || src0->type != dst->type) { - return nullptr; - } - if (ggml_blck_size(src0->type) != 1) { - return nullptr; - } - const size_t type_size = ggml_type_size(src0->type); - switch (type_size) { - case 1: - return ctx->device->pipeline_concat_i8; - case 2: - return ctx->device->pipeline_concat_i16; - case 4: - return ctx->device->pipeline_concat_i32; - case 8: - return ctx->device->pipeline_concat_i64; - default: - return nullptr; - } - } - case GGML_OP_UPSCALE: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS)); - switch (mode) { - case GGML_SCALE_MODE_NEAREST: - return ctx->device->pipeline_upscale_nearest_f32; - case GGML_SCALE_MODE_BILINEAR: - return ctx->device->pipeline_upscale_bilinear_f32; - case GGML_SCALE_MODE_BICUBIC: - return ctx->device->pipeline_upscale_bicubic_f32; - case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS: - return ctx->device->pipeline_upscale_bilinear_antialias_f32; - default: - return nullptr; - } - } - return nullptr; - case GGML_OP_SCALE: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_scale_f32; - } - return nullptr; - case GGML_OP_SQR: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sqr_f32; - } - return nullptr; - case GGML_OP_SQRT: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sqrt_f32; - } - return nullptr; - case GGML_OP_SIN: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sin_f32; - } - return nullptr; - case GGML_OP_COS: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_cos_f32; - } - return nullptr; - case GGML_OP_LOG: - if (src0->type == dst->type && - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { - return ctx->device->pipeline_log[dst->type == GGML_TYPE_F16]; - } - return nullptr; - case GGML_OP_TRI: - if (src0->type == dst->type && - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { - return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; - } - return nullptr; - case GGML_OP_DIAG: - if (src0->type == dst->type && - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { - return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16]; - } - return nullptr; - case GGML_OP_CLAMP: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_clamp_f32; - } - return nullptr; - case GGML_OP_PAD: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_pad_f32; - } - return nullptr; - case GGML_OP_ROLL: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_roll_f32; - } - return nullptr; - case GGML_OP_REPEAT: - if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { - return ctx->device->pipeline_repeat_i32; - } - if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) { - return ctx->device->pipeline_repeat_i16; - } - return nullptr; - case GGML_OP_REPEAT_BACK: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_repeat_back_f32; - } - return nullptr; - case GGML_OP_CPY: - case GGML_OP_CONT: - case GGML_OP_DUP: - return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); - case GGML_OP_SET_ROWS: - if (src1->type == GGML_TYPE_I64) { - return ctx->device->pipeline_set_rows_i64[dst->type]; - } else { - return ctx->device->pipeline_set_rows_i32[dst->type]; - } - case GGML_OP_SILU_BACK: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_silu_back_f32; - } - return nullptr; - case GGML_OP_NORM: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_norm_f32; - } - return nullptr; - case GGML_OP_GROUP_NORM: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_group_norm_f32; - } - return nullptr; - case GGML_OP_RMS_NORM: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - if (ctx->do_add_rms_partials) { - return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_partials_f32 : ctx->device->pipeline_rms_norm_partials_f32; - } else { - return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; - } - } - return nullptr; - case GGML_OP_RMS_NORM_BACK: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rms_norm_back_f32; - } - return nullptr; - case GGML_OP_L2_NORM: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_l2_norm_f32; - } - return nullptr; - case GGML_OP_UNARY: - if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || - (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || - (src0->type != dst->type)) { - return nullptr; - } - - switch (ggml_get_unary_op(dst)) { - case GGML_UNARY_OP_EXP: - return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_EXPM1: - return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_ELU: - return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_SILU: - return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_GELU: - return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_GELU_ERF: - return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_GELU_QUICK: - return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_RELU: - return ctx->device->pipeline_relu[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_XIELU: - return ctx->device->pipeline_xielu[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_NEG: - return ctx->device->pipeline_neg[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_TANH: - return ctx->device->pipeline_tanh[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_SIGMOID: - return ctx->device->pipeline_sigmoid[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_HARDSIGMOID: - return ctx->device->pipeline_hardsigmoid[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_HARDSWISH: - return ctx->device->pipeline_hardswish[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_ABS: - return ctx->device->pipeline_abs[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_SOFTPLUS: - return ctx->device->pipeline_softplus[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_STEP: - return ctx->device->pipeline_step[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_ROUND: - return ctx->device->pipeline_round[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_CEIL: - return ctx->device->pipeline_ceil[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_FLOOR: - return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_TRUNC: - return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16]; - case GGML_UNARY_OP_SGN: - return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16]; - default: - break; - } - return nullptr; - case GGML_OP_GLU: - if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || - (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || - (src0->type != dst->type)) { - return nullptr; - } - - switch (ggml_get_glu_op(dst)) { - case GGML_GLU_OP_GEGLU: - return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16]; - case GGML_GLU_OP_REGLU: - return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16]; - case GGML_GLU_OP_SWIGLU: - return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16]; - case GGML_GLU_OP_SWIGLU_OAI: - return ctx->device->pipeline_swiglu_oai[dst->type == GGML_TYPE_F16]; - case GGML_GLU_OP_GEGLU_ERF: - return ctx->device->pipeline_geglu_erf[dst->type == GGML_TYPE_F16]; - case GGML_GLU_OP_GEGLU_QUICK: - return ctx->device->pipeline_geglu_quick[dst->type == GGML_TYPE_F16]; - default: - break; - } - return nullptr; - case GGML_OP_DIAG_MASK_INF: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_diag_mask_inf_f32; - } - return nullptr; - case GGML_OP_SOFT_MAX: - GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); - - if (ctx->num_additional_fused_ops) { - uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); - GGML_ASSERT(idx < num_topk_moe_pipelines); - // use n_experts from push constant if it's not equal to the power of two spec constant - bool use_push = dst->ne[0] != (1u << idx); - return ctx->device->pipeline_topk_moe[idx][use_push]; - } - - if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { - return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32; - } - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16; - } - return nullptr; - case GGML_OP_SOFT_MAX_BACK: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_soft_max_back_f32; - } - return nullptr; - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - const ggml_tensor *rope = ctx->num_additional_fused_ops == 2 ? dst->src[0]->src[0] : dst; - const int mode = ((const int32_t *) rope->op_params)[2]; - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; - const bool is_vision = mode == GGML_ROPE_TYPE_VISION; - - if (is_neox) { - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rope_neox_f32; - } - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_neox_f32_f16; - } - if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_neox_f16; - } - } else if (is_mrope && !is_vision) { - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rope_multi_f32; - } - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_multi_f32_f16; - } - if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_multi_f16; - } - } else if (is_vision) { - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rope_vision_f32; - } - if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_vision_f16; - } - } else { - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rope_norm_f32; - } - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_norm_f32_f16; - } - if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_rope_norm_f16; - } - } - return nullptr; - } - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sum_rows_f32; - } - return nullptr; - case GGML_OP_CUMSUM: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - if (src0->ne[0] <= 512) { - return ctx->device->pipeline_cumsum_small_f32; - } else { - return ctx->device->pipeline_cumsum_f32; - } - } - return nullptr; - case GGML_OP_SOLVE_TRI: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - - vk_solve_tri_pipeline_state solve_tri_pipeline_state(src0->ne[0], src1->ne[0]); - - vk_pipeline pipeline = nullptr; - - { - std::lock_guard guard(ctx->device->compile_mutex); - auto it = ctx->device->pipeline_solve_tri_f32.find(solve_tri_pipeline_state); - if (it != ctx->device->pipeline_solve_tri_f32.end()) { - pipeline = it->second; - } else { - ctx->device->pipeline_solve_tri_f32[solve_tri_pipeline_state] = pipeline = std::make_shared(); - } - } - - return pipeline; - } - return nullptr; - case GGML_OP_ARGMAX: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) { - return ctx->device->pipeline_argmax_f32; - } - return nullptr; - case GGML_OP_COUNT_EQUAL: - if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I64) { - return ctx->device->pipeline_count_equal_i32; - } - return nullptr; - case GGML_OP_IM2COL: - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_im2col_f32; - } - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_im2col_f32_f16; - } - return nullptr; - case GGML_OP_IM2COL_3D: - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_im2col_3d_f32; - } - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_im2col_3d_f32_f16; - } - return nullptr; - case GGML_OP_TIMESTEP_EMBEDDING: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_timestep_embedding_f32; - } - return nullptr; - case GGML_OP_CONV_TRANSPOSE_1D: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_conv_transpose_1d_f32; - } - return nullptr; - case GGML_OP_COL2IM_1D: - switch (src0->type) { - case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32; - case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16; - case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16; - default: return nullptr; - } - case GGML_OP_POOL_2D: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_pool2d_f32; - } - return nullptr; - case GGML_OP_RWKV_WKV6: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rwkv_wkv6_f32; - } - return nullptr; - case GGML_OP_RWKV_WKV7: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rwkv_wkv7_f32; - } - return nullptr; - case GGML_OP_GATED_DELTA_NET: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - const uint32_t S_v = dst->src[2]->ne[0]; - const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0; - uint32_t si; - switch (S_v) { - case 16: si = 0; break; - case 32: si = 1; break; - case 64: si = 2; break; - case 128: si = 3; break; - default: return nullptr; - } - return ctx->device->pipeline_gated_delta_net[si][kda]; - } - return nullptr; - case GGML_OP_SSM_SCAN: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - const uint32_t d_state = src0->ne[0]; - if (d_state == 128) { - return ctx->device->pipeline_ssm_scan_f32_d128; - } else if (d_state == 256) { - return ctx->device->pipeline_ssm_scan_f32_d256; - } - } - return nullptr; - case GGML_OP_SSM_CONV: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - switch (ctx->num_additional_fused_ops) { - case 0: return ctx->device->pipeline_ssm_conv_f32; - case 1: return ctx->device->pipeline_ssm_conv_silu_f32; - case 2: return ctx->device->pipeline_ssm_conv_bias_silu_f32; - default: return nullptr; - } - } - return nullptr; - case GGML_OP_OPT_STEP_ADAMW: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_opt_step_adamw_f32; - } - return nullptr; - case GGML_OP_OPT_STEP_SGD: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_opt_step_sgd_f32; - } - return nullptr; - case GGML_OP_LEAKY_RELU: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_leaky_relu_f32; - } - return nullptr; - case GGML_OP_CONV_2D: - case GGML_OP_CONV_TRANSPOSE_2D: - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - uint32_t K = dst->ne[2]; // Cout - uint32_t NPQ = dst->ne[3] * dst->ne[1] * dst->ne[0]; // N * OH * OW - vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, K, NPQ); - - bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D; - uint32_t KW = (uint32_t)src0->ne[0]; - uint32_t KH = (uint32_t)src0->ne[1]; - uint32_t s0 = (uint32_t)(ggml_get_op_params_i32(dst, 0)); - uint32_t s1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 1) : s0; - uint32_t p0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 2) : 0; - uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0; - uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1; - uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1; - - // tile-aligned shapes let the shader skip bounds checks - const uint32_t Cin = (uint32_t)src1->ne[2]; - const uint32_t CRS = Cin * KW * KH; - const uint32_t BS_K = vk_conv_block_sizes[shape].K; - const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS; - const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ; - const uint32_t aligned = ((K % BS_K == 0) && - (CRS % BS_CRS == 0) && - (NPQ % BS_NPQ == 0)) ? 1u : 0u; - - vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned); - - std::map *pipelines = nullptr; - if (op == GGML_OP_CONV_2D) { - if (src0->type == GGML_TYPE_F32) { - pipelines = &ctx->device->pipeline_conv2d_f32[shape]; - } else if (src0->type == GGML_TYPE_F16) { - pipelines = &ctx->device->pipeline_conv2d_f16_f32[shape]; - } - } else if (op == GGML_OP_CONV_TRANSPOSE_2D) { - if (src0->type == GGML_TYPE_F32) { - pipelines = &ctx->device->pipeline_conv_transpose_2d_f32[shape]; - } else if (src0->type == GGML_TYPE_F16) { - pipelines = &ctx->device->pipeline_conv_transpose_2d_f16_f32[shape]; - } - } - - vk_pipeline pipeline = nullptr; - - { - std::lock_guard guard(ctx->device->compile_mutex); - auto it = pipelines->find(conv2d_pipeline_state); - if (it != pipelines->end()) { - pipeline = it->second; - } else { - (*pipelines)[conv2d_pipeline_state] = pipeline = std::make_shared(); - } - } - - return pipeline; - } - return nullptr; - case GGML_OP_CONV_2D_DW: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - if (ggml_is_contiguous(src1)) { - return ctx->device->pipeline_conv2d_dw_whcn_f32; - } else if (ggml_is_contiguous_channels(src1)) { - return ctx->device->pipeline_conv2d_dw_cwhn_f32; - } - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - if (ggml_is_contiguous(src1)) { - return ctx->device->pipeline_conv2d_dw_whcn_f16_f32; - } else if (ggml_is_contiguous_channels(src1)) { - return ctx->device->pipeline_conv2d_dw_cwhn_f16_f32; - } - } - return nullptr; - case GGML_OP_ADD1: - if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_add1_f16_f16; - } - if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_add1_f16_f32; - } - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_add1_f32_f32; - } - return nullptr; - case GGML_OP_ARANGE: - if (dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_arange_f32; - } - return nullptr; - case GGML_OP_FILL: - if (dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_fill_f32; - } - if (dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_fill_f16; - } - return nullptr; - default: - return nullptr; - } - - GGML_UNUSED(src2); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.misalign_offsets = (a_offset << 16) | d_offset; - - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset; - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - GGML_ASSERT(a_offset < (1u << 8)); - GGML_ASSERT(b_offset < (1u << 8)); - GGML_ASSERT(d_offset < (1u << 8)); - - p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; - - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.misalign_offsets = (a_offset << 16) | d_offset; - - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.misalign_offsets = (a_offset << 16) | d_offset; - - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.misalign_offsets = (a_offset << 16) | d_offset; - - GGML_UNUSED(src0); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0)); - - p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; - - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - p.a_offset = a_offset; - p.d_offset = d_offset; - - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); - - GGML_UNUSED(src1); - GGML_UNUSED(src2); - GGML_UNUSED(src3); -} - -template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) { - VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; - if (src1 != nullptr) { - std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - } - if (src2 != nullptr) { - std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; - } - if (src3 != nullptr) { - std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3]; - } - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << ggml_op_name(op) << ")"); - GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT - GGML_ASSERT(dst->buffer != nullptr); - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - const uint64_t ne03 = src0->ne[3]; - - const bool use_src1 = src1 != nullptr; - const uint64_t ne10 = use_src1 ? src1->ne[0] : 0; - const uint64_t ne11 = use_src1 ? src1->ne[1] : 0; - const uint64_t ne12 = use_src1 ? src1->ne[2] : 0; - const uint64_t ne13 = use_src1 ? src1->ne[3] : 0; - - const bool use_src2 = src2 != nullptr; - const bool use_src3 = src3 != nullptr; - - init_pushconst_fastdiv(pc); - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); - - if (pipeline == nullptr) { - std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type); - if (src1 != nullptr) { - std::cerr << " and " << ggml_type_name(src1->type); - } - std::cerr << " to " << ggml_type_name(dst->type) << std::endl; - GGML_ABORT("fatal error"); - } - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, true); - vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, true) : vk_subbuffer{}; - vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, true) : vk_subbuffer{}; - vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, true) : vk_subbuffer{}; - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true); - - // Compute misalignment offset for descriptors and store it in in push constants. - init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst); - - std::array elements; - - switch (op) { - case GGML_OP_NORM: - case GGML_OP_RMS_NORM_BACK: - case GGML_OP_L2_NORM: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_SUM_ROWS: - case GGML_OP_CUMSUM: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - { - const uint32_t nr = ggml_nrows(src0); - if (nr > 262144) { - elements = { 512, 512, CEIL_DIV(nr, 262144) }; - } else if (nr > 512) { - elements = { 512, CEIL_DIV(nr, 512), 1 }; - } else { - elements = { nr, 1, 1 }; - } - } break; - case GGML_OP_SOLVE_TRI: - { - uint32_t nr = (uint32_t)(ne02 * ne03); - if (nr > 262144) { - elements = { 512, 512, CEIL_DIV(nr, 262144) }; - } else if (nr > 512) { - elements = { 512, CEIL_DIV(nr, 512), 1 }; - } else { - elements = { nr, 1, 1 }; - } - } - break; - case GGML_OP_RMS_NORM: - if (ctx->do_add_rms_partials) { - // Run one element per thread, 128 threads per workgroup - elements = { (uint32_t)CEIL_DIV(ne00, 128), 1, 1 }; - } else { - elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 }; - } - break; - - case GGML_OP_SUM: - // We use GGML_OP_SUM_ROWS with 1 row. - elements = { 1, 1, 1 }; - break; - case GGML_OP_GROUP_NORM: - { - const uint32_t num_groups = dst->op_params[0]; - elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; - } break; - case GGML_OP_DIAG_MASK_INF: - elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; - break; - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - uint32_t nrows = (uint32_t)ggml_nrows(src0); - uint32_t z = 1; - if (nrows > ctx->device->properties.limits.maxComputeWorkGroupCount[0]) { - z = CEIL_DIV(nrows, 32768); - nrows = 32768; - } - elements = { nrows, (uint32_t)ne00, z }; - - } break; - case GGML_OP_GET_ROWS: - elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; - elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); - elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - break; - case GGML_OP_ARGSORT: - GGML_ASSERT(0); - break; - case GGML_OP_IM2COL: - { - const bool is_2D = dst->op_params[6] == 1; - - const uint32_t IC = src1->ne[is_2D ? 2 : 1]; - - const uint32_t KH = is_2D ? src0->ne[1] : 1; - const uint32_t KW = src0->ne[0]; - - const uint32_t OH = is_2D ? dst->ne[2] : 1; - const uint32_t OW = dst->ne[1]; - - const uint32_t batch = src1->ne[is_2D ? 3 : 2]; - - const uint32_t CHW = IC * KH * KW; - // Cap X workgroups to limit concurrent IC channel reads. - // The shader loops over X to cover the full CHW dimension. - // AMD prefers a lower limit - const uint32_t min_cap = ctx->device->vendor_id == VK_VENDOR_ID_AMD ? 512u : 4096u; - const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW)); - elements = { x_elements, OW, OH * batch }; - elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); - elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - } break; - case GGML_OP_IM2COL_3D: - { - const uint32_t IC = ((const uint32_t *)(dst->op_params))[9]; - - const uint32_t N = ne13 / IC; - - const uint32_t KD = ne02; - const uint32_t KH = ne01; - const uint32_t KW = ne00; - - const uint32_t OD = dst->ne[3] / N; - const uint32_t OH = dst->ne[2]; - const uint32_t OW = dst->ne[1]; - - const uint32_t IC_KD_KH_KW = IC*KD*KH*KW; - const uint32_t N_OD_OH = N*OD*OH; - - elements = { IC_KD_KH_KW, OW, N_OD_OH }; - elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - } break; - case GGML_OP_TIMESTEP_EMBEDDING: - { - const uint32_t dim = dst->op_params[0]; - uint32_t half_ceil = (dim + 1) / 2; - elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1} - } break; - case GGML_OP_COL2IM_1D: - { - elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 }; - } break; - case GGML_OP_POOL_2D: - { - const uint32_t N = dst->ne[3]; - const uint32_t OC = dst->ne[2]; - const uint32_t OH = dst->ne[1]; - const uint32_t OW = dst->ne[0]; - elements = { N * OC * OH * OW, 1, 1}; - } break; - case GGML_OP_CONV_2D: - case GGML_OP_CONV_TRANSPOSE_2D: - if constexpr (std::is_same_v) { - const uint32_t NPQ = pc.N * pc.OH * pc.OW; - const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.Cout, NPQ); - const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ); - - elements = { pc.Cout, NPQ_blocks, 1 }; - if (elements[1] > 512) { - elements[2] = CEIL_DIV(elements[1], 512); - elements[1] = 512; - } - } else { - GGML_ABORT("invalid push constant type for CONV_2D"); - } - break; - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_MUL: - case GGML_OP_ADD1: - case GGML_OP_ARANGE: - case GGML_OP_FILL: - case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_SIN: - case GGML_OP_COS: - case GGML_OP_LOG: - case GGML_OP_TRI: - case GGML_OP_DIAG: - case GGML_OP_CLAMP: - case GGML_OP_PAD: - case GGML_OP_ROLL: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_CPY: - case GGML_OP_CONCAT: - case GGML_OP_UPSCALE: - case GGML_OP_UNARY: - case GGML_OP_GLU: - case GGML_OP_CONV_2D_DW: - { - uint32_t ne = ggml_nelements(dst); - if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { - // Convert from number of logical elements to 2- or 4-byte units. - ne /= ggml_blck_size(src0->type); - if ((ggml_type_size(src0->type) % 4) == 0) { - ne *= ggml_type_size(src0->type) / 4; - } else { - ne *= ggml_type_size(src0->type) / 2; - } - } - // copy_to_quant has block size of 32, and each thread does QUANT_K elements. - // Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements. - // So divide by block size here before splitting into 512x512 groups. - if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { - ne = CEIL_DIV(ne, ggml_blck_size(dst->type)); - } - if (ne > 262144) { - elements = { 512, 512, CEIL_DIV(ne, 262144) }; - } else if (ne > 512) { - elements = { 512, CEIL_DIV(ne, 512), 1 }; - } else { - elements = { ne, 1, 1 }; - } - - if (pipeline == ctx->device->pipeline_cpy_transpose_32 || - pipeline == ctx->device->pipeline_cpy_transpose_16) { - // 32x32 tiles - elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32); - elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32); - elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]); - elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]); - elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); - elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); - } - } break; - case GGML_OP_ADD_ID: - { - elements = { (uint32_t)ne01, (uint32_t)ne02, 1 }; - } break; - case GGML_OP_SET_ROWS: - { - uint32_t ne = ggml_nelements(src0); - if (ggml_is_quantized(dst->type)) { - // quants run 32 threads each doing QUANT_K elements - ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type)); - } else { - // scalar types do one element per thread, running 512 threads - ne = CEIL_DIV(ne, 512); - } - if (ne > 262144) { - elements = { 512, 512, CEIL_DIV(ne, 262144) }; - } else if (ne > 512) { - elements = { 512, CEIL_DIV(ne, 512), 1 }; - } else { - elements = { ne, 1, 1 }; - } - } - break; - case GGML_OP_SSM_CONV: - { - const uint32_t nr = src0->ne[1]; - const uint32_t n_t = dst->ne[1]; - const uint32_t n_s = dst->ne[2]; - elements = { nr, n_t, n_s }; - } - break; - default: - elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; - break; - } - - if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) { - vk_subbuffer a_buf = src0_buf; - if (ctx->do_add_rms_partials) { - a_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset); - } - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { src0_buf, src1_buf, dst_buf, a_buf }, pc, elements); - } else if (op == GGML_OP_GLU) { - // Empty src1 is possible in glu, but the shader needs a buffer - vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc, elements); - } else if (op == GGML_OP_SOFT_MAX) { - // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer - vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; - vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, subbuf2, dst_buf }, pc, elements); - } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { - // Empty src2 and src3 is possible in rope, but the shader needs a buffer - vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; - vk_subbuffer subbuf3 = use_src3 ? src3_buf : src0_buf; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, subbuf2, dst_buf, subbuf3 }, pc, elements); - } else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) { - if (ctx->device->shader_int64 && ctx->device->buffer_device_address) { - // buffer device address path doesn't use dst buffer - dst_buf.size = 1; - } - // im2col uses only src1 and dst buffers - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src1_buf, dst_buf }, pc, elements); - } else if (op == GGML_OP_COUNT_EQUAL) { - // count_equal assumes that destination buffer is initialized with zeroes - ggml_vk_buffer_memset_async(subctx, dst_buf.buffer, dst_buf.offset, 0, dst_buf.size); - ggml_vk_sync_buffers(ctx, subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); - } else if (op == GGML_OP_OPT_STEP_SGD) { - // OPT_STEP_SGD works on src0, it does not need dst - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf }, pc, elements); - } else if (use_src3) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, src3_buf, dst_buf }, pc, elements); - } else if (use_src2) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, dst_buf }, pc, elements); - } else if (use_src1) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); - } else { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, dst_buf }, pc, elements); - } -} - -static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - int nb1 = dst->op_params[0] / src0_type_size; // 4 bytes of float32 - int nb2 = dst->op_params[1] / src0_type_size; // 4 bytes of float32 - int nb3 = dst->op_params[2] / src0_type_size; // 4 bytes of float32 - int offset = dst->op_params[3] / src0_type_size; // offset in bytes - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, - 0, - 0.0f, 0.0f, offset, - }); -} - -static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { - const ggml_tensor *first_node = cgraph->nodes[node_idx]; - const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; - - // Make a list of all the tensors used by the op. - // Last element of the list is the dest tensor. - const ggml_tensor *tensors[MAX_PARAMETER_COUNT]; - uint32_t num_srcs = ctx->num_additional_fused_ops + 2; - uint32_t num_tensors = num_srcs + 1; - GGML_ASSERT(num_tensors + ctx->do_add_rms_partials <= MAX_PARAMETER_COUNT); - - tensors[0] = first_node->src[0]; - tensors[1] = first_node->src[1]; - for (int32_t i = 0; i < ctx->num_additional_fused_ops; ++i) { - // check whether the previous result is src[0] or src[1] - if (cgraph->nodes[node_idx + i] == cgraph->nodes[node_idx + i + 1]->src[0]) { - tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[1]; - } else { - tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[0]; - } - } - tensors[num_srcs] = dst; - - vk_op_multi_add_push_constants pc; - pc.ne20 = (uint32_t)dst->ne[0]; - pc.ne21 = (uint32_t)dst->ne[1]; - pc.ne22 = (uint32_t)dst->ne[2]; - pc.ne23 = (uint32_t)dst->ne[3]; - - for (uint32_t i = 0; i < num_tensors; ++i) { - const ggml_tensor *t = tensors[i]; - pc.nb[i][0] = (uint32_t)t->nb[0] / sizeof(float); - pc.nb[i][1] = (uint32_t)t->nb[1] / sizeof(float); - pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float); - pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float); - } - pc.rms_partials = ctx->do_add_rms_partials; - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, tensors[0], tensors[1], nullptr, dst, dst->op); - - if (pipeline == nullptr) { - std::cerr << "ggml_vulkan: Error: Missing multi_add"; - GGML_ABORT("fatal error"); - } - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT]; - vk_buffer buf[MAX_PARAMETER_COUNT]; - size_t offset[MAX_PARAMETER_COUNT]; - bool uma[MAX_PARAMETER_COUNT]; - - for (uint32_t i = 0; i < num_tensors; ++i) { - buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; - buf[i] = nullptr; - offset[i] = 0; - uma[i] = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); - uma[i] = buf[i] != nullptr; - } - if (!uma[i]) { - buf[i] = buf_ctx[i]->dev_buffer; - offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; - } - GGML_ASSERT(buf[i] != nullptr); - } - // If any remaining descriptors are unused, just point them at src[0] - for (uint32_t i = num_tensors; i < MAX_PARAMETER_COUNT; ++i) { - buf[i] = buf[0]; - offset[i] = 0; - } - if (ctx->do_add_rms_partials) { - buf[num_tensors] = ctx->prealloc_add_rms_partials; - offset[num_tensors] = ctx->prealloc_size_add_rms_partials_offset; - } - - std::array elements; - - uint32_t ne = ggml_nelements(dst); - if (ne > 262144) { - elements = { 512, 512, CEIL_DIV(ne, 262144) }; - } else if (ne > 512) { - elements = { 512, CEIL_DIV(ne, 512), 1 }; - } else { - elements = { ne, 1, 1 }; - } - - static_assert(MAX_PARAMETER_COUNT == 12); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - ggml_vk_subbuffer(ctx, buf[0], offset[0]), - ggml_vk_subbuffer(ctx, buf[1], offset[1]), - ggml_vk_subbuffer(ctx, buf[2], offset[2]), - ggml_vk_subbuffer(ctx, buf[3], offset[3]), - ggml_vk_subbuffer(ctx, buf[4], offset[4]), - ggml_vk_subbuffer(ctx, buf[5], offset[5]), - ggml_vk_subbuffer(ctx, buf[6], offset[6]), - ggml_vk_subbuffer(ctx, buf[7], offset[7]), - ggml_vk_subbuffer(ctx, buf[8], offset[8]), - ggml_vk_subbuffer(ctx, buf[9], offset[9]), - ggml_vk_subbuffer(ctx, buf[10], offset[10]), - ggml_vk_subbuffer(ctx, buf[11], offset[11]), - }, pc, elements); -} - -static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, ctx->do_add_rms_partials, - }); -} - -static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SUB, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_MUL, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_DIV, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t src2_type_size = ggml_type_size(src2->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_ADD_ID, { - (uint32_t)dst->ne[0], - (uint32_t)dst->ne[1], - (uint32_t)src0->nb[1] / src0_type_size, - (uint32_t)src0->nb[2] / src0_type_size, - (uint32_t)src1->nb[1] / src1_type_size, - (uint32_t)src2->nb[1] / src2_type_size, - }); -} - -static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) { - GGML_ASSERT(version == 6 || version == 7); - int num_srcs = version == 6 ? 6 : 7; - - for (int i = 0; i < num_srcs; i++) { - GGML_ASSERT(!ggml_is_quantized(dst->src[i]->type)); - } - - GGML_ASSERT(dst->buffer != nullptr); - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); - GGML_ASSERT(pipeline != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - vk_subbuffer src_buf[7] = {}; - for (int i = 0; i < num_srcs; i++) { - src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); - } - - std::array elements = { - (uint32_t)(pc.B * pc.H), - 1, - 1 - }; - - if (version == 6) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, - pc, elements); - } else if (version == 7) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, - pc, elements); - } else { - // shouldn't happen - GGML_ASSERT(false); - } -} - -static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - const size_t seq_length = dst->src[0]->ne[2]; - const size_t n_embed = dst->ne[0]; - const size_t n_heads = dst->src[0]->ne[1]; - const size_t n_seqs = dst->src[5]->ne[1]; - - ggml_vk_op_f32_wkv( - ctx, subctx, dst, - { - (uint32_t)n_seqs, - (uint32_t)seq_length, - (uint32_t)n_embed, - (uint32_t)n_heads, - }, - 6 - ); -} - -static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - const size_t seq_length = dst->src[0]->ne[2]; - const size_t n_embed = dst->ne[0]; - const size_t n_heads = dst->src[0]->ne[1]; - const size_t n_seqs = dst->src[6]->ne[1]; - - ggml_vk_op_f32_wkv( - ctx, subctx, dst, - { - (uint32_t)n_seqs, - (uint32_t)seq_length, - (uint32_t)n_embed, - (uint32_t)n_heads, - }, - 7 - ); -} - -static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - const ggml_tensor * src_q = dst->src[0]; - const ggml_tensor * src_v = dst->src[2]; - const ggml_tensor * src_beta = dst->src[4]; - - GGML_ASSERT(dst->buffer != nullptr); - - const uint32_t S_v = (uint32_t)src_v->ne[0]; - const uint32_t H = (uint32_t)src_v->ne[1]; - const uint32_t n_tokens = (uint32_t)src_v->ne[2]; - const uint32_t n_seqs = (uint32_t)src_v->ne[3]; - - // K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs]. - const uint32_t K = (uint32_t)ggml_get_op_params_i32(dst, 0); - - const uint32_t s_off = S_v * H * n_tokens * n_seqs; - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); - GGML_ASSERT(pipeline != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - vk_subbuffer src_buf[6] = {}; - for (int i = 0; i < 6; i++) { - src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); - } - - const uint32_t sq1 = (uint32_t)(src_q->nb[1] / sizeof(float)); - const uint32_t sq2 = (uint32_t)(src_q->nb[2] / sizeof(float)); - const uint32_t sq3 = (uint32_t)(src_q->nb[3] / sizeof(float)); - const uint32_t sv1 = (uint32_t)(src_v->nb[1] / sizeof(float)); - const uint32_t sv2 = (uint32_t)(src_v->nb[2] / sizeof(float)); - const uint32_t sv3 = (uint32_t)(src_v->nb[3] / sizeof(float)); - const uint32_t sb1 = (uint32_t)(src_beta->nb[1] / sizeof(float)); - const uint32_t sb2 = (uint32_t)(src_beta->nb[2] / sizeof(float)); - const uint32_t sb3 = (uint32_t)(src_beta->nb[3] / sizeof(float)); - - const uint32_t neq1 = (uint32_t)src_q->ne[1]; - const uint32_t rq3 = (uint32_t)(src_v->ne[3] / src_q->ne[3]); - - const float scale = 1.0f / sqrtf((float)S_v); - const vk_op_gated_delta_net_push_constants pc = { - H, n_tokens, n_seqs, s_off, - sq1, sq2, sq3, - sv1, sv2, sv3, - sb1, sb2, sb3, - neq1, rq3, - scale, - K - }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, - pc, { H, n_seqs, S_v }); -} - -static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const ggml_tensor * src2 = dst->src[2]; - const ggml_tensor * src3 = dst->src[3]; - const ggml_tensor * src4 = dst->src[4]; - const ggml_tensor * src5 = dst->src[5]; - - GGML_ASSERT(dst->buffer != nullptr); - - const uint32_t head_dim = src0->ne[1]; - const uint32_t n_head = src1->ne[1]; - const uint32_t n_group = src4->ne[1]; - const uint32_t n_tok = src1->ne[2]; - const uint32_t n_seq = src1->ne[3]; - - bool is_mamba2 = (src3->nb[1] == sizeof(float)); - GGML_ASSERT(is_mamba2); - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op); - GGML_ASSERT(pipeline != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - const int64_t s_off = ggml_nelements(src1) * sizeof(float); - - const vk_op_ssm_scan_push_constants pc = { - (uint32_t)src0->nb[2], (uint32_t)src0->nb[3], - (uint32_t)src1->nb[2], (uint32_t)src1->nb[3], - (uint32_t)src2->nb[1], (uint32_t)src2->nb[2], - (uint32_t)src3->nb[1], - (uint32_t)src4->nb[2], (uint32_t)src4->nb[3], - (uint32_t)src5->nb[2], (uint32_t)src5->nb[3], - (uint32_t)s_off, - n_head, head_dim, n_group, n_tok - }; - - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - vk_subbuffer src_buf[7] = {}; - for (int i = 0; i < 7 && dst->src[i] != nullptr; i++) { - src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); - } - - std::array elements; - - const uint32_t d_state = src0->ne[0]; - uint32_t num_subgroups = d_state / ctx->device->subgroup_size; - const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups); - const uint32_t num_workgroups_y = n_seq; - elements = { num_workgroups_x, num_workgroups_y, 1 }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, - pc, elements); -} - -static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { - ggml_tensor * conv = cgraph->nodes[node_idx]; - const ggml_tensor * src0 = conv->src[0]; - const ggml_tensor * src1 = conv->src[1]; - - // Pick the destination tensor (last node in the fused chain) and the optional bias. - // Fusion modes: 0 = ssm_conv, 1 = ssm_conv+silu, 2 = ssm_conv+add(bias)+silu. - ggml_tensor * dst = conv; - const ggml_tensor * bias = nullptr; - - if (ctx->num_additional_fused_ops == 1) { - dst = cgraph->nodes[node_idx + 1]; // silu - } else if (ctx->num_additional_fused_ops == 2) { - ggml_tensor * add = cgraph->nodes[node_idx + 1]; - bias = (add->src[0] == conv) ? add->src[1] : add->src[0]; - dst = cgraph->nodes[node_idx + 2]; // silu - } - - // The shader always declares 4 bindings; bind src0 as a dummy when bias isn't fused. - const ggml_tensor * src2 = bias ? bias : src0; - - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SSM_CONV, { - (uint32_t)src0->nb[1], (uint32_t)src0->nb[2], - (uint32_t)src1->nb[1], - (uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2], - (uint32_t)src1->ne[0], - (uint32_t)src0->ne[0], - (uint32_t)src0->ne[1], - (uint32_t)dst->ne[1], - (uint32_t)dst->ne[2], - }); -} - -static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) { - const ggml_tensor * x = dst->src[0]; - const ggml_tensor * g = dst->src[1]; - const ggml_tensor * gm = dst->src[2]; - const ggml_tensor * gv = dst->src[3]; - const ggml_tensor * p = dst->src[4]; - - GGML_ASSERT(x->type == GGML_TYPE_F32); - GGML_ASSERT(g->type == GGML_TYPE_F32); - GGML_ASSERT(gm->type == GGML_TYPE_F32); - GGML_ASSERT(gv->type == GGML_TYPE_F32); - GGML_ASSERT(p->type == GGML_TYPE_F32); - GGML_ASSERT(dst->buffer != nullptr); - GGML_ASSERT(ggml_is_contiguous(x)); - GGML_ASSERT(ggml_is_contiguous(g)); - GGML_ASSERT(ggml_is_contiguous(gm)); - GGML_ASSERT(ggml_is_contiguous(gv)); - GGML_ASSERT(ggml_is_contiguous(p)); - GGML_ASSERT(ggml_are_same_shape(x, g)); - GGML_ASSERT(ggml_are_same_shape(x, gm)); - GGML_ASSERT(ggml_are_same_shape(x, gv)); - GGML_ASSERT(ggml_nelements(p) == 7); - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW); - GGML_ASSERT(pipeline != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); - vk_subbuffer g_buf = ggml_vk_tensor_subbuffer(ctx, g); - vk_subbuffer gm_buf = ggml_vk_tensor_subbuffer(ctx, gm); - vk_subbuffer gv_buf = ggml_vk_tensor_subbuffer(ctx, gv); - vk_subbuffer p_buf = ggml_vk_tensor_subbuffer(ctx, p); - - std::array elements = { (uint32_t)ggml_nelements(x), 1, 1 }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {x_buf, g_buf, gm_buf, gv_buf, p_buf}, - pc, elements); -} - -static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - const size_t n = ggml_nelements(dst->src[0]); - - ggml_vk_op_f32_opt_step_adamw( - ctx, subctx, dst, - { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f } - ); -} - -static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { - const size_t n = ggml_nelements(dst->src[0]); - - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f }); -} - -static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - int * op_params = (int *)dst->op_params; - - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONCAT, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, op_params[0], - }); -} - -static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); - - GGML_TENSOR_UNARY_OP_LOCALS - - float sf0 = (float)ne0 / ne00; - float sf1 = (float)ne1 / ne01; - float sf2 = (float)ne2 / ne02; - float sf3 = (float)ne3 / ne03; - float pixel_offset = 0.5f; - - if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { - sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0; - sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; - pixel_offset = 0.0f; - } - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UPSCALE, { - (uint32_t)ggml_nelements(dst), 0, 0, - (uint32_t)ne00, (uint32_t)ne01, - (uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size, - (uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, - sf0, sf1, sf2, sf3, pixel_offset - }); -} - -static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = ggml_get_op_params_f32(dst, 0); - p.param2 = ggml_get_op_params_f32(dst, 1); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p)); -} - -static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst)); -} - -static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst)); -} - -static void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD1, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_vk_arange(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")"); - - vk_op_push_constants pc = { - (uint32_t)ggml_nelements(dst), - 1, - ggml_get_op_params_f32(dst, 0), - ggml_get_op_params_f32(dst, 2), - 0.0f, 0.0f, - }; - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_ARANGE); - GGML_ASSERT(pipeline != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false); - - std::array elements = { (uint32_t)ggml_nelements(dst), 1, 1 }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements); -} - -static void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_vk_fill(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")"); - - vk_op_push_constants pc = { - (uint32_t)ggml_nelements(dst), - 1, - ggml_get_op_params_f32(dst, 0), - 0.0f, - 0.0f, 0.0f, - }; - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_FILL); - GGML_ASSERT(pipeline != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false); - - std::array elements = { (uint32_t)ggml_nelements(dst), 1, 1 }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements); -} - -static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst)); -} - -static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst)); -} - -static void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LOG, vk_op_unary_push_constants_init(src0, dst)); -} - -static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = ggml_get_op_params_f32(dst, 0); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); -} - -static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p)); -} - -static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = ggml_get_op_params_f32(dst, 0); - p.param2 = ggml_get_op_params_f32(dst, 1); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p)); -} - -static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p)); -} - -static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const int32_t s0 = ggml_get_op_params_i32(dst, 0); - const int32_t s1 = ggml_get_op_params_i32(dst, 1); - const int32_t s2 = ggml_get_op_params_i32(dst, 2); - const int32_t s3 = ggml_get_op_params_i32(dst, 3); - const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000); - const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000); - - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - memcpy(&p.param1, &s01_packed, sizeof(float)); - memcpy(&p.param2, &s23_packed, sizeof(float)); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p)); -} - -static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p)); -} - -static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p)); -} - -static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - uint32_t ne = (uint32_t)ggml_nelements(src0); - if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { - // Convert from number of logical elements to 2- or 4-byte units. - ne /= ggml_blck_size(src0->type); - if ((ggml_type_size(src0->type) % 4) == 0) { - ne *= ggml_type_size(src0->type) / 4; - } else { - ne *= ggml_type_size(src0->type) / 2; - } - } - - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p)); -} - -static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - // Skip empty skip_rows operations. For most ops the empty check at the start - // of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst - // with empty srcs. - if (ggml_is_empty(src0) || ggml_is_empty(src1)) { - return; - } - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SET_ROWS, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); -} - -static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - float * op_params = (float *)dst->op_params; - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); -} - -static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const int * int_op_params = (const int *)dst->op_params; - const float * float_op_params = (const float *)dst->op_params; - - const uint32_t num_groups = int_op_params[0]; - const float eps = float_op_params[1]; - const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f, 0.0f, 0.0f }); -} - -static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) { - const uint32_t ne = (uint32_t)node->ne[0]; - const uint32_t denom = ctx->device->pipeline_add_rms[0][0][0]->wg_denoms[0]; - const uint32_t num_partials = CEIL_DIV(ne, denom); - return num_partials; -} - -static uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node) { - const uint32_t num_partials = ggml_vk_rms_num_partials(ctx, node); - const uint32_t num_bytes = ROUNDUP_POW2(num_partials * sizeof(uint32_t), ctx->device->partials_binding_alignment); - return num_bytes; -} - -static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *dst, const ggml_tensor *src0, const bool has_ff, bool backprop, const uint32_t set_rows_stride) { - const int n_dims = ((const int32_t *) dst->op_params)[1]; - const int mode = ((const int32_t *) dst->op_params)[2]; - // const int n_ctx = ((const int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((const int32_t *) dst->op_params)[4]; - const float freq_base = ((const float *) dst->op_params)[5]; - const float freq_scale = ((const float *) dst->op_params)[6]; - const float ext_factor = ((const float *) dst->op_params)[7]; - const float attn_factor = ((const float *) dst->op_params)[8]; - const float beta_fast = ((const float *) dst->op_params)[9]; - const float beta_slow = ((const float *) dst->op_params)[10]; - int sections[4] {}; - if (mode & GGML_ROPE_TYPE_MROPE) { - memcpy(sections, (const int32_t *) dst->op_params + 11, sizeof(int)*4); - } - - const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; - - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type); - uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type); - uint32_t nb03 = src0->nb[3] / ggml_type_size(src0->type); - - uint32_t nb11 = dst->nb[1] / ggml_type_size(dst->type); - uint32_t nb12 = dst->nb[2] / ggml_type_size(dst->type); - uint32_t nb13 = dst->nb[3] / ggml_type_size(dst->type); - - vk_op_rope_push_constants rope { - (uint32_t)mode, (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale, - freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, has_ff, - { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, - - (uint32_t)src0->ne[0], - (uint32_t)src0->ne[1], - (uint32_t)src0->ne[2], - nb01, nb02, nb03, - nb11, nb12, nb13, - 0, 0, // a_offset, d_offset filled in by init_pushconst_tensor_offsets - }; - - return rope; -} - -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params) { - ggml_tensor * dst; - const ggml_tensor * src0; - const ggml_tensor * src1; - - if (ctx->num_additional_fused_ops > 0) { - // fused rms_norm + mul - ggml_tensor *mul = cgraph->nodes[node_idx + 1]; - ggml_tensor *other_src = mul->src[0] == cgraph->nodes[node_idx + 0] ? mul->src[1] : mul->src[0]; - dst = mul; - src0 = cgraph->nodes[node_idx]->src[0]; - src1 = other_src; - } else { - dst = cgraph->nodes[node_idx]; - src0 = src1 = dst->src[0]; - } - - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - uint32_t param3 = ctx->do_add_rms_partials ? ggml_vk_rms_num_partials(ctx, dst) : 0; - - vk_op_binary_push_constants bin { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - op_params[0], 0.0f, (int32_t)param3, - }; - - // more than one fused op means rms_norm+mul+rope - if (ctx->num_additional_fused_ops > 1) { - static constexpr uint32_t max_tensors = 7; - const ggml_tensor *tensors[max_tensors] {}; - - ggml_tensor *rms = cgraph->nodes[node_idx + 0]; - ggml_tensor *mul = cgraph->nodes[node_idx + 1]; - ggml_tensor *rope = cgraph->nodes[node_idx + 2]; - - ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0]; - - bool do_set_rows = ctx->num_additional_fused_ops == 4; - - tensors[0] = rms->src[0]; - tensors[1] = other_src; - tensors[2] = mul; - tensors[3] = rope->src[1]; // pos - tensors[4] = rope->src[2]; // ff - tensors[5] = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; // dst - tensors[6] = do_set_rows ? tensors[5]->src[1] : nullptr; - const uint32_t set_rows_stride = do_set_rows ? tensors[5]->nb[1] / ggml_type_size(tensors[5]->type) : 0; - - vk_op_rms_norm_mul_rope_push_constants pc; - pc.bin = bin; - pc.rope = ggml_vk_make_rope_constants(rope, rope->src[0], tensors[4] != nullptr, false, set_rows_stride); - - vk_pipeline pipeline = tensors[5]->type == GGML_TYPE_F16 ? ctx->device->pipeline_rms_norm_mul_rope_f32_f16 : ctx->device->pipeline_rms_norm_mul_rope_f32_f32; - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - ggml_backend_vk_buffer_context * buf_ctx[max_tensors]; - vk_buffer buf[max_tensors]; - size_t offset[max_tensors]; - bool uma[max_tensors]; - - for (uint32_t i = 0; i < max_tensors; ++i) { - if (!tensors[i]) { - // If any remaining descriptors are unused, just point them at src[0] - buf[i] = buf[0]; - offset[i] = 0; - continue; - } - buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; - buf[i] = nullptr; - offset[i] = 0; - uma[i] = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); - uma[i] = buf[i] != nullptr; - } - if (!uma[i]) { - buf[i] = buf_ctx[i]->dev_buffer; - offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; - } - GGML_ASSERT(buf[i] != nullptr); - } - - // a_offset is unused (the fused path reads from shared memory), but the rope/set_rows dst can be misaligned. - // Round the binding offset down to the storage buffer alignment; the in-element shift goes in pc.rope.d_offset. - pc.rope.d_offset = get_misalign_bytes(ctx, tensors[5]) / ggml_type_size(tensors[5]->type); - offset[5] &= ~(size_t(ctx->device->properties.limits.minStorageBufferOffsetAlignment) - 1); - - std::array elements; - elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] }; - - static_assert(max_tensors == 7); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - ggml_vk_subbuffer(ctx, buf[0], offset[0]), - ggml_vk_subbuffer(ctx, buf[1], offset[1]), - ggml_vk_subbuffer(ctx, buf[2], offset[2]), - ggml_vk_subbuffer(ctx, buf[3], offset[3]), - ggml_vk_subbuffer(ctx, buf[4], offset[4]), - ggml_vk_subbuffer(ctx, buf[5], offset[5]), - ggml_vk_subbuffer(ctx, buf[6], offset[6]), - }, pc, elements); - } else { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, std::move(bin)); - } - - if (ctx->do_add_rms_partials_offset_calculation) { - ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0); - ctx->do_add_rms_partials = false; - ctx->do_add_rms_partials_offset_calculation = false; - } -} - -static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); -} - -static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const float * op_params = (const float *)dst->op_params; - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = op_params[0]; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, std::move(p)); -} - -static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst)); -} - -static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - float * op_params = (float *)dst->op_params; - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = op_params[1]; - p.param2 = op_params[2]; - p.param3 = op_params[3]; - p.param4 = op_params[4]; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p)); -} - -static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const float * op_params_f = (const float *)dst->op_params; - - const bool swapped = (bool)dst->op_params[1]; - const bool split = src1 != nullptr; - const float alpha = op_params_f[2]; - const float limit = op_params_f[3]; - - if (!split) { - GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]); - } else { - GGML_ASSERT(src0->ne[0] == src1->ne[0]); - GGML_ASSERT(src0->ne[0] == dst->ne[0]); - GGML_ASSERT(src0->type == src1->type); - } - - const uint32_t mode = split ? 2 : (swapped ? 1 : 0); - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size; - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU, - { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], - (uint32_t)dst->ne[0], - mode, - alpha, - limit, - (uint32_t)(src0->nb[0] / src0_type_size), - (uint32_t)(src0->nb[1] / src0_type_size), - (uint32_t)(src0->nb[2] / src0_type_size), - (uint32_t)(src0->nb[3] / src0_type_size), - (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size), - (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size), - (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size), - (uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size), - (uint32_t)(dst->nb[0] / dst_type_size), - (uint32_t)(dst->nb[1] / dst_type_size), - (uint32_t)(dst->nb[2] / dst_type_size), - (uint32_t)(dst->nb[3] / dst_type_size), - (uint32_t)dst->ne[1], - (uint32_t)dst->ne[2], - 0, - 0, 0, 0, 0, 0, 0, - }); -} - -static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - int32_t * op_params = (int32_t *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); -} - -static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { - float * op_params = (float *)dst->op_params; - - float scale = op_params[0]; - float max_bias = op_params[1]; - - const uint32_t ncols = (uint32_t)src0->ne[0]; - const uint32_t nrows_x = (uint32_t)ggml_nrows(src0); - const uint32_t nrows_y = (uint32_t)src0->ne[1]; - - const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u; - const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u; - const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u; - const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u; - const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u; - - const uint32_t n_head_kv = src0->ne[2]; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - vk_op_soft_max_push_constants pc { - ncols, - src1 != nullptr ? nrows_y : (uint32_t)0, - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], - ne12, ne13, - nb11, nb12, nb13, - scale, max_bias, - m0, m1, - n_head_log2, - nrows_x, - src2 != nullptr - }; - - if (ncols <= 16384) { - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc)); - } else { - - vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a; - vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a; - vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst); - - uint32_t elems_per_wg = 128 * 4; - uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg); - size_t tmp_size = num_wgs * nrows_x * sizeof(float); - - if (ctx->prealloc_size_x < tmp_size) { - ctx->prealloc_size_x = tmp_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if (ctx->prealloc_size_y < tmp_size) { - ctx->prealloc_size_y = tmp_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - - vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size }; - vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size }; - - std::array elements = { num_wgs, nrows_x, 1 }; - - vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32; - vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32; - vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32; - - ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); - ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); - ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1); - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); - ggml_vk_sync_buffers(ctx, subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); - ggml_vk_sync_buffers(ctx, subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); - - ctx->prealloc_x_need_sync = true; - ctx->prealloc_y_need_sync = true; - } -} - -static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1], 0.0f, 0.0f }); -} - -static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { - topk_moe_mode mode = ctx->fused_topk_moe_mode; - ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0]; - ggml_tensor * bias = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits; - ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; - ggml_tensor * ids = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] : - (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : - cgraph->nodes[node_idx + 3]; - - GGML_ASSERT(logits->type == GGML_TYPE_F32); - GGML_ASSERT(bias->type == GGML_TYPE_F32); - GGML_ASSERT(weights->type == GGML_TYPE_F32); - GGML_ASSERT(ids->type == GGML_TYPE_I32); - - const int n_experts = logits->ne[0]; - const int n_rows = logits->ne[1]; - const int n_expert_used = weights->ne[1]; - - GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts); - - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits); - vk_subbuffer bias_buf = ggml_vk_tensor_subbuffer(ctx, bias); - vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights); - vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids); - - vk_op_topk_moe_push_constants pc {}; - pc.n_rows = n_rows; - pc.n_experts_push = n_experts; - pc.n_expert_used = n_expert_used; - pc.clamp_min = -std::numeric_limits::infinity(); - pc.clamp_max = std::numeric_limits::infinity(); - if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { - ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; - GGML_ASSERT(clamp->op == GGML_OP_CLAMP); - pc.clamp_min = ggml_get_op_params_f32(clamp, 0); - pc.clamp_max = ggml_get_op_params_f32(clamp, 1); - } - if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) { - ggml_tensor * clamp = cgraph->nodes[node_idx + 8]; - GGML_ASSERT(clamp->op == GGML_OP_CLAMP); - pc.clamp_min = ggml_get_op_params_f32(clamp, 0); - pc.clamp_max = ggml_get_op_params_f32(clamp, 1); - } - -#define GATING_FUNC_SOFTMAX 0 -#define GATING_FUNC_SIGMOID 1 -#define GATING_FUNC_SOFTMAX_WEIGHT 2 - - pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID : - mode == TOPK_MOE_LATE_SOFTMAX ? GATING_FUNC_SOFTMAX_WEIGHT : - GATING_FUNC_SOFTMAX; - pc.has_bias = mode == TOPK_MOE_SIGMOID_NORM_BIAS; - pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS; - if (ctx->fused_topk_moe_scale) { - GGML_ASSERT(weights->op == GGML_OP_SCALE); - pc.output_scale = ggml_get_op_params_f32(weights, 0); - pc.output_bias = ggml_get_op_params_f32(weights, 1); - } else { - pc.output_scale = 1.0f; - pc.output_bias = 0.0f; - } - - GGML_ASSERT(n_expert_used <= n_experts); - - const uint32_t rows_per_block = 4; - std::array elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 }; - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements); -} - -static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) { - ggml_tensor * dst = cgraph->nodes[node_idx]; - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const ggml_tensor * src2 = dst->src[2]; - const ggml_tensor * src3 = nullptr; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - // const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - const float freq_base = ((float *) dst->op_params)[5]; - const float beta_fast = ((float *) dst->op_params)[9]; - const float beta_slow = ((float *) dst->op_params)[10]; - int sections[4] {}; - if (mode & GGML_ROPE_TYPE_MROPE) { - memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4); - } - - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - uint32_t set_rows_stride = 0; - // Fused rope + view + set_rows passes the set_rows destination stride in set_rows_stride - // and overrides the dst and sets src3=row_indices - if (ctx->num_additional_fused_ops > 0) { - set_rows_stride = cgraph->nodes[node_idx + 2]->nb[1] / ggml_type_size(cgraph->nodes[node_idx + 2]->type); - src3 = cgraph->nodes[node_idx + 2]->src[1]; - dst = cgraph->nodes[node_idx + 2]; - } - - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, src3, dst, GGML_OP_ROPE, - ggml_vk_make_rope_constants(cgraph->nodes[node_idx], src0, src2 != nullptr, backprop, set_rows_stride)); -} - -static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const uint32_t * op_params = (const uint32_t *)dst->op_params; - - uint32_t ncols = src0->ne[0]; - uint32_t nrows = ggml_nrows(src0); - - uint32_t ncols_pad_log2 = (uint32_t)ceilf(log2f(float(ncols))); - uint32_t ncolsp2 = 1 << ncols_pad_log2; - - vk_op_argsort_push_constants pc { ncols, ncolsp2, ncols_pad_log2, nrows, op_params[0], 0, 0, 0, 0, }; - - // Pick the largest workgroup size <= ncolsp2 - uint32_t pipeline_idx = std::min(ncols_pad_log2, num_argsort_pipelines - 1); - - // Use the "small" argsort shader if the whole sort can be done by a single workgroup. - bool use_small = ncols_pad_log2 <= ctx->device->max_workgroup_size_log2 && - ctx->device->pipeline_argsort_f32[pipeline_idx] != nullptr; - - vk_pipeline pipeline = use_small ? ctx->device->pipeline_argsort_f32[pipeline_idx] - : ctx->device->pipeline_argsort_large_f32[pipeline_idx]; - - vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - vk_subbuffer subbuf1 = dst_buf; - - // Reserve space for ivec2 per element, with rows padded to a power of two - if (!use_small) { - const size_t x_sz = size_t{ncolsp2} * nrows * 2 * sizeof(int); - - if (ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - if (ctx->prealloc_x_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - subbuf1 = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; - } - - std::array elements; - - elements[0] = ncolsp2; - elements[1] = std::min((uint32_t)ggml_nrows(src0), ctx->device->properties.limits.maxComputeWorkGroupCount[1]); - elements[2] = 1; - - // First dispatch initializes tmp_idx and does the first N passes where - // there is only communication between threads in the same workgroup. - { - vk_op_argsort_push_constants pc2 = pc; - pc2.outer_start = 0; - pc2.outer_end = std::min(ncols_pad_log2, ctx->device->max_workgroup_size_log2); - pc2.inner_start = 0; - pc2.inner_end = 100; - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements); - } - if (!use_small) { - ggml_vk_sync_buffers(ctx, subctx); - // Loop over outer/inner passes, synchronizing between each pass. - for (uint32_t outer = ctx->device->max_workgroup_size_log2; outer < ncols_pad_log2; ++outer) { - for (uint32_t inner = 0; inner < outer + 1; ++inner) { - vk_op_argsort_push_constants pc2 = pc; - pc2.outer_start = outer; - pc2.outer_end = outer + 1; - pc2.inner_start = inner; - pc2.inner_end = inner + 1; - // When the inner idx is large enough, there's only communication - // within a workgroup. So the remaining inner iterations can all - // run in the same dispatch. - if (outer - inner < pipeline_idx) { - pc2.inner_end = 100; - inner = outer; - pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx]; - } else { - // Smaller workgroup empirically seems to perform better - pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx - 2]; - } - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements); - ggml_vk_sync_buffers(ctx, subctx); - } - } - ctx->prealloc_x_need_sync = true; - } -} - -static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - uint32_t ncols = src0->ne[0]; - uint32_t nrows = ggml_nrows(src0); - uint32_t k = dst->ne[0]; - - vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 }; - - if (ctx->prealloc_x_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - - std::array elements; - elements[1] = std::min(nrows, ctx->device->properties.limits.maxComputeWorkGroupCount[1]); - elements[2] = 1; - - uint32_t num_elements = ncols; - - // Each iteration reduces a workgroup's worth of elements down to the K - // largest elements. Repeat until we have the top K elements. - // Need to do at least one iteration to write out the results. - bool done_one_iter = false; - uint32_t dbl_buf_index = 0; - size_t dbl_buf_size; - while (num_elements > k || !done_one_iter) { - - // Prefer going as small as num_topk_pipelines - 3 for perf reasons. - // But if K is larger, then we need a larger workgroup - uint32_t max_pipeline = num_topk_pipelines - 1; - uint32_t preferred_pipeline = std::max(num_topk_pipelines - 3, (uint32_t)log2f(float(k)) + 2); - max_pipeline = std::min(preferred_pipeline, max_pipeline); - uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1; - // require full subgroup - min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2); - - uint32_t pipeline_idx = (uint32_t)ceilf(log2f(float(num_elements))); - pipeline_idx = std::min(pipeline_idx, max_pipeline); - pipeline_idx = std::max(pipeline_idx, min_pipeline); - - if (num_elements > (1u << pipeline_idx)) { - // If we could finish on this loop iteration (i.e. a single workgroup) - // then do so. It's better than the overhead of another pass. - for (uint32_t i = pipeline_idx; i < num_topk_pipelines; ++i) { - if (num_elements <= (1u << i)) { - pipeline_idx = i; - break; - } - } - } - - vk_pipeline pipeline = ctx->device->pipeline_topk_f32[pipeline_idx]; - // If the device doesn't support a pipeline this large, use smaller - while (!pipeline) { - pipeline_idx--; - GGML_ASSERT(pipeline_idx >= min_pipeline); - pipeline = ctx->device->pipeline_topk_f32[pipeline_idx]; - } - - vk_op_topk_push_constants pc2 = pc; - pc2.ncols_input = num_elements; - - // Number of elements remaining after this pass - uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]); - - pc2.ncols_output = num_dst_elements; - - if (!done_one_iter) { - // Reserve space for ivec2 per element, double buffered - // K per workgroup per row - dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int); - dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment); - const size_t x_sz = dbl_buf_size * 2; - - if (ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - ggml_vk_preallocate_buffers(ctx, subctx); - } - } - - vk_subbuffer src_buf; - vk_subbuffer dst_buf; - - if (num_elements == ncols) { - pc2.first_pass = 1; - src_buf = ggml_vk_tensor_subbuffer(ctx, src0); - } else { - src_buf = { ctx->prealloc_x, dbl_buf_index * dbl_buf_size, dbl_buf_size }; - } - if (num_dst_elements == k) { - pc2.last_pass = 1; - dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - } else { - dst_buf = { ctx->prealloc_x, (dbl_buf_index ^ 1) * dbl_buf_size, dbl_buf_size }; - } - - elements[0] = num_elements; - - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc2, elements); - num_elements = num_dst_elements; - dbl_buf_index ^= 1; - if (num_elements > k) { - ggml_vk_sync_buffers(ctx, subctx); - } - done_one_iter = true; - } - ctx->prealloc_x_need_sync = true; -} - -static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p); -} - -static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p); -} - -static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); - p.weight = 1.0f / (float)src0->ne[0]; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p); -} - -static void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); - // Use the single pass shader when the rows are small or there are enough rows to fill the GPU. - // For fewer, larger rows, use the multipass shader to spread each row across SMs. - if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc); - return; - } - - // First pass computes partial sums within a block, and stores the last partial - // to the temp buffer. Second pass sums the block partials from the temp buffer - // and adds that to the result of the first pass. - vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32; - vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32; - GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr); - - ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); - ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); - - std::array elements; - - elements[0] = dst->ne[0]; - elements[1] = (uint32_t)ggml_nrows(dst); - elements[2] = 1; - - size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst); - - if (ctx->prealloc_size_split_k < temp_size) { - ctx->prealloc_size_split_k = temp_size; - ggml_vk_preallocate_buffers(ctx, subctx); - } - - vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0); - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); - vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); - - if (ctx->prealloc_split_k_need_sync) { - ggml_vk_sync_buffers(ctx, subctx); - } - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements); - ggml_vk_sync_buffers(ctx, subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements); - - ctx->prealloc_split_k_need_sync = true; -} - -static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f, 0.0f, 0.0f }); -} - -static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); -} - -static void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t src1_type_size = ggml_type_size(src1->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOLVE_TRI, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, 0, - }); -} - -static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const int32_t s0 = dst->op_params[0]; - const int32_t s1 = dst->op_params[1]; - const int32_t p0 = dst->op_params[2]; - const int32_t p1 = dst->op_params[3]; - const int32_t d0 = dst->op_params[4]; - const int32_t d1 = dst->op_params[5]; - - const bool is_2D = dst->op_params[6] == 1; - - const uint32_t IC = src1->ne[is_2D ? 2 : 1]; - const uint32_t IH = is_2D ? src1->ne[1] : 1; - const uint32_t IW = src1->ne[0]; - - const uint32_t KH = is_2D ? src0->ne[1] : 1; - const uint32_t KW = src0->ne[0]; - - const uint32_t OH = is_2D ? dst->ne[2] : 1; - const uint32_t OW = dst->ne[1]; - - const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 - const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32 - - const uint32_t batch = src1->ne[is_2D ? 3 : 2]; - - const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - const vk_buffer d_buf = d_buf_ctx->dev_buffer; - - const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL, { - dst_addr, - batch_offset, offset_delta, - IC, IW, IH, OW, OH, KW, KH, - OH * batch, - IC * KH * KW, - s0, s1, p0, p1, d0, d1, batch * IC - }); -} - -static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_TENSOR_BINARY_OP_LOCALS - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t s2 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[3]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[4]; - const int32_t p2 = ((const int32_t *)(dst->op_params))[5]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[6]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[7]; - const int32_t d2 = ((const int32_t *)(dst->op_params))[8]; - const int32_t IC = ((const int32_t *)(dst->op_params))[9]; - - const int64_t N = ne13 / IC; - const int64_t ID = ne12; - const int64_t IH = ne11; - const int64_t IW = ne10; - - const int64_t KD = ne02; - const int64_t KH = ne01; - const int64_t KW = ne00; - - const int64_t OD = ne3 / N; - const int64_t OH = ne2; - const int64_t OW = ne1; - - const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - const vk_buffer d_buf = d_buf_ctx->dev_buffer; - - const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; - - vk_op_im2col_3d_push_constants pc {}; - - pc.dst_addr = dst_addr; - pc.nb10 = nb10 / ggml_type_size(src1->type); - pc.nb11 = nb11 / ggml_type_size(src1->type); - pc.nb12 = nb12 / ggml_type_size(src1->type); - pc.nb13 = nb13 / ggml_type_size(src1->type); - pc.s0 = s0; - pc.s1 = s1; - pc.s2 = s2; - pc.p0 = p0; - pc.p1 = p1; - pc.p2 = p2; - pc.d0 = d0; - pc.d1 = d1; - pc.d2 = d2; - pc.IW = IW; - pc.IH = IH; - pc.ID = ID; - pc.IC = IC; - pc.KW = KW; - pc.OH = OH; - pc.KD_KH_KW = KD*KH*KW; - pc.KH_KW = KH*KW; - pc.IC_KD_KH_KW = IC*KD*KH*KW; - pc.N_OD_OH = N*OD*OH; - pc.OD_OH = OD*OH; - pc.OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW; - pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW; - pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW; - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc)); -} - -static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const uint32_t dim = dst->op_params[0]; - const uint32_t max_period = dst->op_params[1]; - const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, { - nb1, dim, max_period, - }); -} - -static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // src0: (K, Cout, Cin, 1) -- kernel - // src1: (L, Cin, 1, 1) -- input - // dst: (*, Cout, 1, 1) - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); - - const int32_t s0 = dst->op_params[0]; - - vk_op_conv_transpose_1d_push_constants p{}; - p.Cout = static_cast(ne01); - p.Cin = static_cast(ne02); - p.K = static_cast(ne00); - p.L = static_cast(ne10); - p.KL = static_cast(ne0); - p.nb01 = static_cast(nb01 / nb00); - p.nb02 = static_cast(nb02 / nb00); - p.nb11 = static_cast(nb11 / nb10); - p.nb1 = static_cast(nb1 / nb0); - p.s0 = static_cast(s0); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p)); -} - -static void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - // src0: [K_OC, T_in] columns from matmul - // dst: [T_out, OC] - - const int32_t stride = dst->op_params[0]; - const int32_t oc = dst->op_params[1]; - const int32_t p0 = dst->op_params[2]; - - const uint32_t K_OC = static_cast(src0->ne[0]); - const uint32_t T_in = static_cast(src0->ne[1]); - const uint32_t T_out = static_cast(dst->ne[0]); - const uint32_t OC = static_cast(oc); - const uint32_t K = K_OC / OC; - - vk_op_col2im_1d_push_constants p{}; - p.T_out = T_out; - p.OC = OC; - p.K_OC = K_OC; - p.T_in = T_in; - p.K = K; - p.stride = stride; - p.p0 = p0; - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p)); -} - -// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b. -// Match the naive mul -> sin -> sqr -> mul -> add chain and run the -// dedicated kernel directly. The pattern is validated by -// ggml_vk_can_fuse_snake before this call. -static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { - const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; - const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; - const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; - ggml_tensor * add = cgraph->nodes[node_idx + 4]; - - // x carries the full activation shape, a is the broadcast operand - const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; - const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; - - // mul1 reads sqr and inv_b in either operand order - const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; - - vk_pipeline pipeline = nullptr; - switch (x->type) { - case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break; - case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break; - case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break; - default: GGML_ABORT("unsupported type"); - } - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - - vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); - vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a); - vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b); - vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add); - - vk_op_snake_push_constants pc{}; - pc.ne0 = static_cast(x->ne[0]); - pc.ne1 = static_cast(x->ne[1]); - - std::array elements = { pc.ne0, pc.ne1, 1 }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements); -} - -static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - uint32_t op = static_cast(dst->op_params[0]); - const int32_t k1 = dst->op_params[1]; - const int32_t k0 = dst->op_params[2]; - const int32_t s1 = dst->op_params[3]; - const int32_t s0 = dst->op_params[4]; - const int32_t p1 = dst->op_params[5]; - const int32_t p0 = dst->op_params[6]; - - const uint32_t IH = src0->ne[1]; - const uint32_t IW = src0->ne[0]; - - const uint32_t N = dst->ne[3]; - - const uint32_t OC = dst->ne[2]; - const uint32_t OH = dst->ne[1]; - const uint32_t OW = dst->ne[0]; - - const uint32_t parallel_elements = N * OC * OH * OW; - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_POOL_2D, { - IW, IH, OW, OH, OC, - parallel_elements, - op, - k0, k1, s0, s1, p0, p1, - }); -} - -static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - GGML_ASSERT(nb0 == sizeof(float)); - - bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D; - - vk_op_conv2d_push_constants p{}; - p.Cout = static_cast(!transpose ? ne03 : ne02); - p.Cin = static_cast(!transpose ? ne02 : ne03); - p.N = static_cast(ne13); - GGML_ASSERT(p.Cout == ne2); - GGML_ASSERT(p.Cin == ne12); - - p.W = static_cast(ne10); - p.H = static_cast(ne11); - p.OW = static_cast(ne0); - p.OH = static_cast(ne1); - - p.nb01 = static_cast(nb01 / nb00); - p.nb02 = static_cast(nb02 / nb00); - p.nb03 = static_cast(nb03 / nb00); - - p.nb11 = static_cast(nb11 / nb10); - p.nb12 = static_cast(nb12 / nb10); - p.nb13 = static_cast(nb13 / nb10); - - p.nb1 = static_cast(nb1 / nb0); - p.nb2 = static_cast(nb2 / nb0); - p.nb3 = static_cast(nb3 / nb0); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p)); -} - -static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - vk_op_conv2d_dw_push_constants p{}; - p.ne = ggml_nelements(dst); - p.channels = dst->ne[2]; - p.batches = dst->ne[3]; - p.dst_w = dst->ne[0]; - p.dst_h = dst->ne[1]; - p.src_w = src1->ne[0]; - p.src_h = src1->ne[1]; - p.knl_w = src0->ne[0]; - p.knl_h = src0->ne[1]; - p.stride_x = dst->op_params[0]; - p.stride_y = dst->op_params[1]; - p.pad_x = dst->op_params[2]; - p.pad_y = dst->op_params[3]; - p.dilation_x = dst->op_params[4]; - p.dilation_y = dst->op_params[5]; - - GGML_ASSERT(src0->ne[3] == p.channels); - GGML_ASSERT(src1->ne[3] == p.batches); - - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p)); -} - -static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - const float * op_params = (const float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f, 0.0f, 0.0f }); -} - -#ifdef GGML_VULKAN_RUN_TESTS -static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { - if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { - return; - } - i0 = std::max(i0, 5); - i1 = std::max(i1, 5); - i2 = std::max(i2, 0); - fprintf(stderr, " "); - for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { - fprintf(stderr, "%7d ", idx1); - } - fprintf(stderr, "\n"); - for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { - fprintf(stderr, "%7d: ", idx0); - for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { - if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) { - float val; - if (type == GGML_TYPE_F32) { - val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0); - } else if (type == GGML_TYPE_F16) { - val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); - } else { - GGML_ABORT("fatal error"); - } - fprintf(stderr, "% 7.2f ", val); - } else { - fprintf(stderr, " "); - } - } - fprintf(stderr, "\n"); - } -} - -template -static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { - VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")"); - const size_t x_ne = m * k * batch; - const size_t y_ne = k * n * batch; - const size_t d_ne = m * n * batch; - - vk_pipeline p; - std::string shname; - if (shader_size == 0) { - if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32->a_s; - shname = "F32_ALIGNED_S"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32_f16->a_s; - shname = "F32_F16_ALIGNED_S"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_s; - shname = "F16_F32_ALIGNED_S"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16.f32acc->a_s; - shname = "F16_ALIGNED_S"; - } else { - GGML_ABORT("fatal error"); - } - } else if (shader_size == 1) { - if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32->a_m; - shname = "F32_ALIGNED_M"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32_f16->a_m; - shname = "F32_F16_ALIGNED_M"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_m; - shname = "F16_F32_ALIGNED_M"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16.f32acc->a_m; - shname = "F16_ALIGNED_M"; - } else { - GGML_ABORT("fatal error"); - } - } else if (shader_size == 2) { - if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32->a_l; - shname = "F32_ALIGNED_L"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32_f16->a_l; - shname = "F32_F16_ALIGNED_L"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_l; - shname = "F16_F32_ALIGNED_L"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16.f32acc->a_l; - shname = "F16_ALIGNED_L"; - } else { - GGML_ABORT("fatal error"); - } - } else { - GGML_ASSERT(0); - } - - const size_t kpad = ggml_vk_align_size(k, p->align); - - if (k != kpad) { - if (shader_size == 0) { - if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32->s; - shname = "F32_S"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32_f16->s; - shname = "F32_F16_S"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16_f32.f32acc->s; - shname = "F16_F32_S"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16.f32acc->s; - shname = "F16_S"; - } - } else if (shader_size == 1) { - if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32->m; - shname = "F32_M"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32_f16->m; - shname = "F32_F16_M"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16_f32.f32acc->m; - shname = "F16_F32_M"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16.f32acc->m; - shname = "F16_M"; - } - } else if (shader_size == 2) { - if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32->l; - shname = "F32_L"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f32_f16->l; - shname = "F32_F16_L"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16_f32.f32acc->l; - shname = "F16_F32_L"; - } else if (std::is_same() && std::is_same()) { - p = ctx->device->pipeline_matmul_f16.f32acc->l; - shname = "F16_L"; - } - } - } - - if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); - - if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { - // Resize buffer - if (ctx->prealloc_split_k != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_split_k); - } - ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - } - } - - ggml_pipeline_allocate_descriptor_sets(ctx); - - vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - - X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); - Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); - float* d = (float *) malloc(sizeof(float) * d_ne); - - for (size_t i = 0; i < x_ne; i++) { - if (std::is_same()) { - x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; - // x[i] = 1.0f; - // x[i] = i + 1; - // x[i] = (i % k == i / k) ? 1.0f : 0.0f; - } else if (std::is_same()) { - x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); - // x[i] = ggml_fp32_to_fp16(1.0f); - // x[i] = ggml_fp32_to_fp16(i + 1); - // x[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); - } else { - GGML_ABORT("fatal error"); - } - } - for (size_t i = 0; i < y_ne; i++) { - if (std::is_same()) { - y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; - // y[i] = (i % k == i / k) ? 1.0f : 0.0f; - // y[i] = i + 1; - } else if (std::is_same()) { - y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); - // y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); - // y[i] = ggml_fp32_to_fp16(i + 1); - } else { - GGML_ABORT("fatal error"); - } - } - - ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); - ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); - - vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ggml_vk_ctx_begin(ctx->device, subctx); - for (size_t i = 0; i < num_it; i++) { - ggml_vk_matmul( - ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k), - m, n, k, - k, k, m, k*m, k*n, m*n, - split_k, batch, batch, batch, 1, 1, n - ); - } - ggml_vk_ctx_end(subctx); - - auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); - ctx->device->device.resetFences({ ctx->fence }); - ggml_vk_queue_command_pools_cleanup(ctx->device); - - auto end = std::chrono::high_resolution_clock::now(); - double time = std::chrono::duration_cast(end-begin).count() / 1000.0; - - // copy dst to host - ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne); - - float * d_chk = (float *) malloc(sizeof(float) * d_ne); - - ggml_init_params iparams = { - /*.mem_size =*/ 1024*1024*1024, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ggml_context * ggml_ctx = ggml_init(iparams); - - ggml_type src0_type; - ggml_type src1_type; - - if (std::is_same()) { - src0_type = GGML_TYPE_F32; - } else if (std::is_same()) { - src0_type = GGML_TYPE_F16; - } else { - GGML_ABORT("fatal error"); - } - if (std::is_same()) { - src1_type = GGML_TYPE_F32; - } else if (std::is_same()) { - src1_type = GGML_TYPE_F16; - } else { - GGML_ABORT("fatal error"); - } - - ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); - ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch); - ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); - - src0_ggml->data = x; - src1_ggml->data = y; - tensor_ggml->data = d_chk; - - ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); - ggml_build_forward_expand(cgraph, tensor_ggml); - - ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); - - ggml_free(ggml_ctx); - - double avg_err = 0.0; - int first_err_n = -1; - int first_err_m = -1; - int first_err_b = -1; - - for (size_t i = 0; i < m*n*batch; i++) { - double err = std::fabs(d[i] - d_chk[i]); - avg_err += err; - - if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { - first_err_b = i / (m * n); - first_err_n = (i % (m * n)) / m; - first_err_m = (i % (m * n)) % m; - } - } - - avg_err /= m * n; - - double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0); - - std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; - - if (avg_err > 0.1 || std::isnan(avg_err)) { - std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; - std::cerr << "Actual result: " << std::endl << std::endl; - ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - std::cerr << "Expected result: " << std::endl << std::endl; - ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - if (split_k > 1) { - float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); - ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); - - std::cerr << "d_buf0: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "d_buf1: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "d_buf2: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "d_buf3: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - free(split_k_buf); - } - } - - free(d_chk); - - ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); - - ggml_vk_destroy_buffer(d_X); - ggml_vk_destroy_buffer(d_Y); - ggml_vk_destroy_buffer(d_D); - - free(x); - free(y); - free(d); -} - -static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) { - if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { - return; - } - i0 = std::max(i0, 5); - i1 = std::max(i1, 5); - i2 = std::max(i2, 0); - i3 = std::max(i3, 0); - fprintf(stderr, " "); - for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { - fprintf(stderr, "%7d ", idx1); - } - fprintf(stderr, "\n"); - for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { - fprintf(stderr, "%7d: ", idx0); - for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { - if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { - float val; - if (tensor->type == GGML_TYPE_F32) { - val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); - } else if (tensor->type == GGML_TYPE_F16) { - val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); - } else { - GGML_ABORT("fatal error"); - } - fprintf(stderr, "% 7.2f ", val); - } else { - fprintf(stderr, " "); - } - } - fprintf(stderr, "\n"); - } -} - -static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { - ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); -} - -static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) { - if (quant == GGML_TYPE_F32) { - memcpy(to, from, sizeof(float) * ne); - return; - } - - const auto * tt = ggml_get_type_traits(quant); - - ggml_to_float_t dequant_fn = tt->to_float; - - dequant_fn(from, to, ne); -} - -static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { - VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); - const size_t x_sz = sizeof(float) * ne; - const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne; - const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); - float * x = (float *) malloc(x_sz); - void * qx = malloc(qx_sz); - vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - float * x_ref = (float *) malloc(x_sz); - ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); - - for (size_t i = 0; i < ne; i++) { - x[i] = rand() / (float)RAND_MAX; - } - - vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant); - - ggml_vk_quantize_data(x, qx, ne, quant); - ggml_vk_dequantize_data(qx, x_ref, ne, quant); - - ggml_pipeline_request_descriptor_sets(ctx, p, 1); - - ggml_pipeline_allocate_descriptor_sets(ctx); - - ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); - - vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ggml_vk_ctx_begin(ctx->device, subctx); - const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; - ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1}); - ggml_vk_ctx_end(subctx); - - auto begin = std::chrono::high_resolution_clock::now(); - - ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); - ctx->device->device.resetFences({ ctx->fence }); - ggml_vk_queue_command_pools_cleanup(ctx->device); - - auto end = std::chrono::high_resolution_clock::now(); - - double ms_dequant = std::chrono::duration_cast(end-begin).count() / 1000.0; - ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16); - - int first_err = -1; - - double avg_err = 0.0; - for (size_t i = 0; i < ne; i++) { - double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i])); - avg_err += error; - - if (first_err < 0 && error > 0.05) { - first_err = i; - } - } - - avg_err /= ne; - - std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl; - - if (avg_err > 0.1) { - std::cerr << "first_error = " << first_err << std::endl; - std::cerr << "Actual result: " << std::endl << std::endl; - for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { - std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", "; - } - std::cerr << std::endl << "Expected result: " << std::endl << std::endl; - for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { - std::cerr << x_ref[i] << ", "; - } - std::cerr << std::endl; - } - - ggml_vk_destroy_buffer(x_buf); - ggml_vk_destroy_buffer(qx_buf); - - free(x); - free(qx); - free(x_ref); - free(x_chk); -} - -// This does not work without ggml q8_1 quantization support -// -// typedef uint16_t ggml_half; -// typedef uint32_t ggml_half2; -// -// #define QK8_1 32 -// typedef struct { -// union { -// struct { -// ggml_half d; // delta -// ggml_half s; // d * sum(qs[i]) -// } GGML_COMMON_AGGR_S; -// ggml_half2 ds; -// } GGML_COMMON_AGGR_U; -// int8_t qs[QK8_1]; // quants -// } block_q8_1; -// -// static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { -// VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")"); -// GGML_ASSERT(quant == GGML_TYPE_Q8_1); -// -// const size_t x_sz = sizeof(float) * ne; -// const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); -// float * x = (float *) malloc(x_sz); -// block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz); -// block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz); -// vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); -// vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); -// -// for (size_t i = 0; i < ne; i++) { -// x[i] = rand() / (float)RAND_MAX; -// } -// -// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant); -// -// ggml_pipeline_request_descriptor_sets(ctx, p, 1); -// -// ggml_pipeline_allocate_descriptor_sets(ctx); -// -// ggml_vk_buffer_write(x_buf, 0, x, x_sz); -// -// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); -// ggml_vk_ctx_begin(ctx->device, subctx); -// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne); -// ggml_vk_ctx_end(subctx); -// -// auto begin = std::chrono::high_resolution_clock::now(); -// -// ggml_vk_submit(subctx, ctx->fence); -// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences"); -// ctx->device->device.resetFences({ ctx->fence }); -// ggml_vk_queue_command_pools_cleanup(ctx->device); -// -// auto end = std::chrono::high_resolution_clock::now(); -// -// double ms_quant = std::chrono::duration_cast(end-begin).count() / 1000.0; -// ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz); -// -// ggml_vk_quantize_data(x, qx_res, ne, quant); -// -// int first_err = -1; -// -// for (size_t i = 0; i < ne / 32; i++) { -// double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); -// -// if (first_err < 0 && error > 0.1) { -// first_err = i; -// } -// -// error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s)); -// -// if (first_err < 0 && error > 0.1) { -// first_err = i; -// } -// -// for (size_t j = 0; j < 32; j++) { -// uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]); -// -// if (first_err < 0 && error > 1) { -// first_err = i; -// } -// } -// } -// -// std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl; -// -// if (first_err != -1) { -// std::cerr << "first_error = " << first_err << std::endl; -// std::cerr << "Actual result: " << std::endl << std::endl; -// std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " "; -// for (size_t j = 0; j < 32; j++) { -// std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " "; -// } -// std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl; -// std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " "; -// for (size_t j = 0; j < 32; j++) { -// std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " "; -// } -// std::cerr << std::endl; -// } -// -// ggml_vk_destroy_buffer(x_buf); -// ggml_vk_destroy_buffer(qx_buf); -// -// free(x); -// free(qx); -// free(qx_res); -// } - -static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant, bool mmq = false) { - VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")"); - const size_t x_ne = m * k * batch; - const size_t y_ne = k * n * batch; - const size_t d_ne = m * n * batch; - - vk_matmul_pipeline2 * pipelines; - - if (mmq) { - pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1; - } else { - pipelines = ctx->device->pipeline_dequant_mul_mat_mat; - } - - const bool fp16acc = ctx->device->fp16; - - vk_pipeline p; - std::string shname; - if (shader_size == 0) { - p = fp16acc ? pipelines[quant].f16acc->a_s : pipelines[quant].f32acc->a_s; - shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S"; - } else if (shader_size == 1) { - p = fp16acc ? pipelines[quant].f16acc->a_m : pipelines[quant].f32acc->a_m; - shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M"; - } else if (shader_size == 2) { - p = fp16acc ? pipelines[quant].f16acc->a_l : pipelines[quant].f32acc->a_l; - shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L"; - } else { - GGML_ASSERT(0); - } - - const size_t kpad = mmq ? 0 : ggml_vk_align_size(k, p->align); - - if (mmq || k != kpad) { - if (shader_size == 0) { - p = fp16acc ? pipelines[quant].f16acc->s : pipelines[quant].f32acc->s; - shname = std::string(ggml_type_name(quant)) + "_S"; - } else if (shader_size == 1) { - p = fp16acc ? pipelines[quant].f16acc->m : pipelines[quant].f32acc->m; - shname = std::string(ggml_type_name(quant)) + "_M"; - } else if (shader_size == 2) { - p = fp16acc ? pipelines[quant].f16acc->l : pipelines[quant].f32acc->l; - shname = std::string(ggml_type_name(quant)) + "_L"; - } else { - GGML_ASSERT(0); - } - } - - if (p == nullptr) { - std::cerr << "error: no pipeline for ggml_vk_test_dequant_matmul " << ggml_type_name(quant) << std::endl; - return; - } - - const size_t x_sz = sizeof(float) * x_ne; - const size_t y_sz = sizeof(float) * y_ne; - const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant); - const size_t qy_sz = mmq ? y_ne * ggml_type_size(GGML_TYPE_Q8_1)/ggml_blck_size(GGML_TYPE_Q8_1) : y_sz; - const size_t d_sz = sizeof(float) * d_ne; - float * x = (float *) malloc(x_sz); - float * y = (float *) malloc(y_sz); - void * qx = malloc(qx_sz); - vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - vk_buffer qy_buf = ggml_vk_create_buffer_check(ctx->device, qy_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - float * d = (float *) malloc(d_sz); - float * d_chk = (float *) malloc(d_sz); - - for (size_t i = 0; i < x_ne; i++) { - x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; - // x[i] = (i % k == i / k) ? 1.0f : 0.0f; - // x[i] = i % k; - } - - ggml_vk_quantize_data(x, qx, x_ne, quant); - - for (size_t i = 0; i < y_ne; i++) { - y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; - // y[i] = (i % k == i / k) ? 1.0f : 0.0f; - // y[i] = i % k; - } - - if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); - - if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { - // Resize buffer - if (ctx->prealloc_split_k != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_split_k); - } - ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, {vk::MemoryPropertyFlagBits::eDeviceLocal}); - } - } - if (mmq) { - vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it); - } - - ggml_pipeline_allocate_descriptor_sets(ctx); - - ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); - ggml_vk_buffer_write(y_buf, 0, y, y_sz); - - vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ggml_vk_ctx_begin(ctx->device, subctx); - if (mmq) { - for (size_t i = 0; i < num_it; i++) { - ggml_vk_quantize_q8_1(ctx, subctx, { y_buf, 0, y_sz }, { qy_buf, 0, qy_sz }, y_ne); - ggml_vk_matmul( - ctx, subctx, p, { qx_buf, 0, qx_sz }, { qy_buf, 0, qy_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k }, - m, n, k, - k, k, m, k*m, k*n, m*n, - split_k, batch, batch, batch, 1, 1, n - ); - } - } else { - for (size_t i = 0; i < num_it; i++) { - ggml_vk_matmul( - ctx, subctx, p, { qx_buf, 0, qx_sz }, { y_buf, 0, y_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k }, - m, n, k, - k, k, m, k*m, k*n, m*n, - split_k, batch, batch, batch, 1, 1, n - ); - } - } - ggml_vk_ctx_end(subctx); - - auto begin = std::chrono::high_resolution_clock::now(); - - ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); - ctx->device->device.resetFences({ ctx->fence }); - ggml_vk_queue_command_pools_cleanup(ctx->device); - - auto end = std::chrono::high_resolution_clock::now(); - - double time_ms = std::chrono::duration_cast(end-begin).count() / 1000.0; - ggml_vk_buffer_read(d_buf, 0, d, d_sz); - - ggml_init_params iparams = { - /*.mem_size =*/ 1024*1024*1024, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ggml_context * ggml_ctx = ggml_init(iparams); - - ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch); - ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch); - ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); - - src0_ggml->data = qx; - src1_ggml->data = y; - tensor_ggml->data = d_chk; - - ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); - ggml_build_forward_expand(cgraph, tensor_ggml); - - ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); - - ggml_free(ggml_ctx); - - double avg_err = 0.0; - int first_err_n = -1; - int first_err_m = -1; - int first_err_b = -1; - - for (size_t i = 0; i < m*n*batch; i++) { - double err = std::fabs(d[i] - d_chk[i]); - avg_err += err; - - if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { - first_err_b = i / (m * n); - first_err_n = (i % (m * n)) / m; - first_err_m = (i % (m * n)) % m; - } - } - - avg_err /= m * n; - - double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0); - - std::cerr << "TEST dequant matmul " << shname; - if (mmq) { - std::cerr << " mmq"; - } - std::cerr << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; - - if (avg_err > 0.01 || std::isnan(avg_err)) { - std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; - std::cerr << "Actual result: " << std::endl << std::endl; - ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - std::cerr << std::endl; - std::cerr << "Expected result: " << std::endl << std::endl; - ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "src0: " << std::endl << std::endl; - ggml_vk_print_matrix_area(x, GGML_TYPE_F32, k, m, first_err_m, first_err_n, first_err_b); - std::cerr << std::endl; - std::cerr << "src1: " << std::endl << std::endl; - ggml_vk_print_matrix_area(y, GGML_TYPE_F32, k, n, first_err_m, first_err_n, first_err_b); - - if (split_k > 1) { - float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); - ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); - - std::cerr << "d_buf0: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "d_buf1: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "d_buf2: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - std::cerr << "d_buf3: " << std::endl << std::endl; - ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - - free(split_k_buf); - } - } - - ggml_vk_destroy_buffer(qx_buf); - ggml_vk_destroy_buffer(y_buf); - ggml_vk_destroy_buffer(qy_buf); - ggml_vk_destroy_buffer(d_buf); - - free(x); - free(qx); - free(y); - free(d); - free(d_chk); -} -#endif - -static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) { -#if defined(GGML_VULKAN_RUN_TESTS) - const std::vector vals { - 512, 512, 128, - 128, 512, 512, - 4096, 512, 4096, - 11008, 512, 4096, - 4096, 512, 11008, - 32000, 512, 4096, - 8, 8, 8, - 100, 46, 576, - 623, 111, 128, - 100, 46, 558, - 512, 1, 256, - 128, 110, 622, - 511, 511, 127, - 511, 511, 7, - 511, 511, 17, - 49, 49, 128, - 128, 49, 49, - 4096, 49, 4096, - }; - const size_t num_it = 100; - - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0); - - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0, true); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0, true); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0, true); - - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0); - - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0, true); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0, true); - ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0, true); - - abort(); - - for (size_t i = 0; i < vals.size(); i += 3) { - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); - std::cerr << '\n'; - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0); - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1); - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2); - std::cerr << '\n'; - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); - ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); - std::cerr << '\n' << std::endl; - - if (vals[i + 2] % 32 == 0) { - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0); - std::cerr << '\n'; - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0); - std::cerr << '\n'; - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0); - std::cerr << '\n' << std::endl; - } - - if (vals[i + 2] % 256 == 0) { - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K); - std::cerr << '\n'; - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K); - std::cerr << '\n'; - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K); - std::cerr << '\n' << std::endl; - } - } - - GGML_ABORT("fatal error"); -#endif - - if (subctx) { - // Submit and wait for any pending work before reallocating the buffers - ggml_vk_ctx_end(subctx); - ggml_vk_submit(subctx, {}); - ctx->submit_pending = true; - ggml_vk_synchronize(ctx); - GGML_ASSERT(ctx->compute_ctx.expired()); - ggml_vk_ctx_begin(ctx->device, subctx); - ctx->compute_ctx = subctx; - } - - if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { - VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")"); - // Resize buffer - if (ctx->prealloc_x != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_x); - } - ctx->prealloc_x = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_x); - } - if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { - VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")"); - // Resize buffer - if (ctx->prealloc_y != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_y); - } - ctx->prealloc_y = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_y); - ctx->prealloc_y_last_pipeline_used = nullptr; - ctx->prealloc_y_last_tensor_used = nullptr; - ctx->prealloc_y_last_decode_vector_staging = false; - } - if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { - VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")"); - // Resize buffer - if (ctx->prealloc_split_k != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_split_k); - } - ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k); - } - if (ctx->prealloc_add_rms_partials == nullptr || (ctx->prealloc_size_add_rms_partials > 0 && ctx->prealloc_add_rms_partials->size < ctx->prealloc_size_add_rms_partials)) { - VK_LOG_MEMORY("ggml_vk_preallocate_buffers(add_partials_size: " << ctx->prealloc_add_rms_partials << ")"); - // Resize buffer - if (ctx->prealloc_add_rms_partials != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials); - } - ctx->prealloc_add_rms_partials = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_add_rms_partials); - } -} - -static void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready); - -// Returns true if node has enqueued work into the queue, false otherwise -// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. -static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){ - ggml_tensor * node = cgraph->nodes[node_idx]; - if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) { - return false; - } - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { - return false; - } - - VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); - ctx->semaphore_idx = 0; - - ggml_tensor * src0 = node->src[0]; - ggml_tensor * src1 = node->src[1]; - ggml_tensor * src2 = node->src[2]; - ggml_tensor * src3 = node->src[3]; - - if (node->op == GGML_OP_ADD) { - int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops; - if (next_node_idx < cgraph->n_nodes && - cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM && - cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] && - ggml_nrows(cgraph->nodes[next_node_idx]) == 1 && - ctx->device->add_rms_fusion) { - uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]); - ctx->do_add_rms_partials_offset_calculation = true; - if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) { - ctx->do_add_rms_partials = true; - } - } - } - - vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - - { - // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers - // to synchronize them. This handles most "normal" synchronization when computing the graph, and when - // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers - // outside of this logic. When a node uses one of the prealloc buffers for something like - // dequantization or split_k, additional synchronization is needed between those passes. - bool need_sync = false; - - // Check whether "node" requires synchronization. The node requires synchronization if it - // overlaps in memory with another unsynchronized node and at least one of them is a write. - // Destination nodes are checked against both the written/read lists. Source nodes are only - // checked against the written list. Two nodes overlap in memory if they come from the same - // buffer and the tensor or view ranges overlap. - auto const &overlaps_unsynced = [&](const ggml_tensor *node, const std::vector &unsynced_nodes) -> bool { - if (unsynced_nodes.size() == 0) { - return false; - } - auto n_base = vk_tensor_offset(node) + node->view_offs; - auto n_size = ggml_nbytes(node); - ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)node->buffer->context; - vk_buffer a_buf = a_buf_ctx->dev_buffer; - for (auto &other : unsynced_nodes) { - ggml_backend_vk_buffer_context * o_buf_ctx = (ggml_backend_vk_buffer_context *)other->buffer->context; - vk_buffer o_buf = o_buf_ctx->dev_buffer; - if (a_buf == o_buf) { - auto o_base = vk_tensor_offset(other) + other->view_offs; - auto o_size = ggml_nbytes(other); - - if ((o_base <= n_base && n_base < o_base + o_size) || - (n_base <= o_base && o_base < n_base + n_size)) { - return true; - } - } - } - return false; - }; - - // For all fused ops, check if the destination node or any of the source - // nodes require synchronization. - for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1 && !need_sync; ++i) { - const ggml_tensor *cur_node = cgraph->nodes[node_idx + i]; - // If the node actually writes to memory, then check if it needs to sync - if (ctx->fused_ops_write_mask & (1 << i)) { - if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) { - need_sync = true; - break; - } - } - for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) { - if (!cur_node->src[j]) { - continue; - } - if (overlaps_unsynced(cur_node->src[j], ctx->unsynced_nodes_written)) { - need_sync = true; - break; - } - } - } - - if (need_sync) { - if (vk_enable_sync_logger) { - std::cerr << "sync" << std::endl; - } - ctx->unsynced_nodes_written.clear(); - ctx->unsynced_nodes_read.clear(); - ggml_vk_sync_buffers(ctx, compute_ctx); - - if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { - ctx->query_node_idx[ctx->query_idx] = node_idx; - compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); - ggml_vk_sync_buffers(ctx, compute_ctx); - } - } - // Add all fused nodes to the unsynchronized lists. - for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { - const ggml_tensor *cur_node = cgraph->nodes[node_idx + i]; - // Multiple outputs could be written, e.g. in topk_moe. Add them all to the list. - if (ctx->fused_ops_write_mask & (1 << i)) { - ctx->unsynced_nodes_written.push_back(cur_node); - } - for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) { - if (!cur_node->src[j]) { - continue; - } - ctx->unsynced_nodes_read.push_back(cur_node->src[j]); - } - } - } - if (vk_enable_sync_logger) { - for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { - auto *n = cgraph->nodes[node_idx + i]; - std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; - if (n->op == GGML_OP_GLU) { - std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; - } - if (n->op == GGML_OP_ROPE) { - const int mode = ((const int32_t *) n->op_params)[2]; - std::cerr << " rope mode: " << mode; - } - std::cerr << std::endl; - } - } - - switch (node->op) { - case GGML_OP_REPEAT: - ggml_vk_repeat(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_REPEAT_BACK: - ggml_vk_repeat_back(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_ACC: - case GGML_OP_SET: - ggml_vk_acc(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_GET_ROWS: - ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_ADD: - if (ctx->num_additional_fused_ops) { - ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx); - } else { - ggml_vk_add(ctx, compute_ctx, src0, src1, node); - } - break; - case GGML_OP_SUB: - ggml_vk_sub(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_MUL: - if (ctx->num_additional_fused_ops) { - ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx); - } else { - ggml_vk_mul(ctx, compute_ctx, src0, src1, node); - } - - break; - case GGML_OP_DIV: - ggml_vk_div(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_ADD_ID: - ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node); - - break; - case GGML_OP_CONCAT: - ggml_vk_concat(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_UPSCALE: - ggml_vk_upscale(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_ADD1: - ggml_vk_add1(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_ARANGE: - ggml_vk_arange(ctx, compute_ctx, node); - - break; - case GGML_OP_FILL: - ggml_vk_fill(ctx, compute_ctx, node); - - break; - case GGML_OP_SCALE: - ggml_vk_scale(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SQR: - ggml_vk_sqr(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SQRT: - ggml_vk_sqrt(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SIN: - ggml_vk_sin(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_COS: - ggml_vk_cos(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_LOG: - ggml_vk_log(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_TRI: - ggml_vk_tri(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_DIAG: - ggml_vk_diag(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_CLAMP: - ggml_vk_clamp(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_PAD: - ggml_vk_pad(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_ROLL: - ggml_vk_roll(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_CPY: - case GGML_OP_CONT: - case GGML_OP_DUP: - ggml_vk_cpy(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SET_ROWS: - ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_SILU_BACK: - ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_NORM: - ggml_vk_norm(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_GROUP_NORM: - ggml_vk_group_norm(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(ctx, compute_ctx, cgraph, node_idx, (float *)node->op_params); - break; - case GGML_OP_RMS_NORM_BACK: - ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_L2_NORM: - ggml_vk_l2_norm(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_UNARY: - if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { - ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); - break; - } - - switch (ggml_get_unary_op(node)) { - case GGML_UNARY_OP_ELU: - case GGML_UNARY_OP_EXP: - case GGML_UNARY_OP_EXPM1: - case GGML_UNARY_OP_SILU: - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_ERF: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_HARDSIGMOID: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_ABS: - case GGML_UNARY_OP_SOFTPLUS: - case GGML_UNARY_OP_STEP: - case GGML_UNARY_OP_ROUND: - case GGML_UNARY_OP_CEIL: - case GGML_UNARY_OP_FLOOR: - case GGML_UNARY_OP_TRUNC: - case GGML_UNARY_OP_SGN: - ggml_vk_unary(ctx, compute_ctx, src0, node); - break; - case GGML_UNARY_OP_XIELU: - ggml_vk_xielu(ctx, compute_ctx, src0, node); - break; - default: - return false; - } - break; - case GGML_OP_GLU: - switch (ggml_get_glu_op(node)) { - case GGML_GLU_OP_GEGLU: - case GGML_GLU_OP_REGLU: - case GGML_GLU_OP_SWIGLU: - case GGML_GLU_OP_SWIGLU_OAI: - case GGML_GLU_OP_GEGLU_ERF: - case GGML_GLU_OP_GEGLU_QUICK: - ggml_vk_glu(ctx, compute_ctx, src0, src1, node); - break; - default: - return false; - } - break; - case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SOFT_MAX: - if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { - ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); - } else { - ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node); - } - - break; - case GGML_OP_SOFT_MAX_BACK: - ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_ROPE: - ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false); - - break; - case GGML_OP_ROPE_BACK: - ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true); - - break; - case GGML_OP_ARGSORT: - if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { - ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); - } else { - ggml_vk_argsort(ctx, compute_ctx, src0, node); - } - - break; - case GGML_OP_TOP_K: - ggml_vk_topk(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SUM: - ggml_vk_sum(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_SUM_ROWS: - ggml_vk_sum_rows(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_CUMSUM: - ggml_vk_cumsum(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_MEAN: - ggml_vk_mean(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_ARGMAX: - ggml_vk_argmax(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_COUNT_EQUAL: - ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_SOLVE_TRI: - ggml_vk_solve_tri(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_IM2COL: - ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_IM2COL_3D: - ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_TIMESTEP_EMBEDDING: - ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_COL2IM_1D: - ggml_vk_col2im_1d(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_CONV_TRANSPOSE_1D: - ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_POOL_2D: - ggml_vk_pool_2d(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_CONV_2D: - case GGML_OP_CONV_TRANSPOSE_2D: - ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_CONV_2D_DW: - ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node); - - break; - case GGML_OP_LEAKY_RELU: - ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); - - break; - case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx); - - break; - case GGML_OP_MUL_MAT_ID: - ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx); - - break; - - case GGML_OP_FLASH_ATTN_EXT: - ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node); - - break; - - case GGML_OP_RWKV_WKV6: - ggml_vk_rwkv_wkv6(ctx, compute_ctx, node); - - break; - - case GGML_OP_RWKV_WKV7: - ggml_vk_rwkv_wkv7(ctx, compute_ctx, node); - - break; - - case GGML_OP_GATED_DELTA_NET: - ggml_vk_gated_delta_net(ctx, compute_ctx, node); - - break; - - case GGML_OP_SSM_SCAN: - ggml_vk_ssm_scan(ctx, compute_ctx, node); - - break; - - case GGML_OP_SSM_CONV: - ggml_vk_ssm_conv(ctx, compute_ctx, cgraph, node_idx); - - break; - - case GGML_OP_OPT_STEP_ADAMW: - ggml_vk_opt_step_adamw(ctx, compute_ctx, node); - - break; - - case GGML_OP_OPT_STEP_SGD: - ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node); - - break; - default: - return false; - } - - ctx->tensor_ctxs[node_idx] = compute_ctx; - -#if defined(GGML_VULKAN_CHECK_RESULTS) - // Force context reset on each node so that each tensor ends up in its own context - // and can be run and compared to its CPU equivalent separately - last_node = true; -#endif - - if (submit || last_node) { - ggml_vk_ctx_end(compute_ctx); - - // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward - if (last_node) { - compute_ctx->exit_tensor_idx = node_idx_begin; - } - else { - compute_ctx->exit_tensor_idx = -1; - } - - ctx->compute_ctx.reset(); - - ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready); - } - return true; -} - -static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) { - GGML_UNUSED(cgraph); - GGML_UNUSED(tensor); - - VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); - - vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); - - // Only run if ctx hasn't been submitted yet - if (!subctx->seqs.empty()) { -#ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(ctx, cgraph, tensor_idx); -#endif - - // Do staging buffer copies - for (auto& cpy : subctx->in_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - - for (auto& mset : subctx->memsets) { - memset(mset.dst, mset.val, mset.n); - } - - if (almost_ready && !ctx->almost_ready_fence_pending) { - ggml_vk_submit(subctx, ctx->almost_ready_fence); - ctx->almost_ready_fence_pending = true; - } else { - ggml_vk_submit(subctx, {}); - } - ctx->submit_pending = true; - -#ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_synchronize(ctx); - ggml_vk_check_results_1(ctx, cgraph, tensor_idx); -#endif - } - - if (tensor_idx == subctx->exit_tensor_idx) { - // Do staging buffer copies - for (auto& cpy : subctx->out_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - subctx->in_memcpys.clear(); - subctx->out_memcpys.clear(); - subctx->memsets.clear(); - } -} - -// Clean up after graph processing is done -static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { - VK_LOG_DEBUG("ggml_vk_graph_cleanup()"); - ctx->prealloc_y_last_pipeline_used = {}; - ctx->prealloc_y_last_tensor_used = nullptr; - ctx->prealloc_y_last_decode_vector_staging = false; - - ctx->unsynced_nodes_written.clear(); - ctx->unsynced_nodes_read.clear(); - ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; - - ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); - if (ctx->device->async_use_transfer_queue) { - ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); - } - - for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { - ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); - } - ctx->gc.semaphores.clear(); - - for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { - ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); - } - ctx->gc.tl_semaphores.clear(); - ctx->semaphore_idx = 0; - - ctx->event_idx = 0; - - for (auto& event : ctx->gc.events) { - ctx->device->device.resetEvent(event); - } - - ctx->tensor_ctxs.clear(); - ctx->gc.contexts.clear(); - ctx->pipeline_descriptor_set_requirements = 0; - ctx->descriptor_set_idx = 0; -} - -// Clean up on backend free -static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { - VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")"); - // discard any unsubmitted command buffers - ctx->compute_ctx.reset(); - // wait for any pending command buffers to finish - ggml_vk_synchronize(ctx); - - ggml_vk_graph_cleanup(ctx); - - ggml_vk_destroy_buffer(ctx->prealloc_x); - ggml_vk_destroy_buffer(ctx->prealloc_y); - ggml_vk_destroy_buffer(ctx->prealloc_split_k); - ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials); - ggml_vk_destroy_buffer(ctx->sync_staging); - - ctx->prealloc_y_last_pipeline_used = nullptr; - ctx->prealloc_y_last_tensor_used = nullptr; - ctx->prealloc_y_last_decode_vector_staging = false; - - ctx->prealloc_size_x = 0; - ctx->prealloc_size_y = 0; - ctx->prealloc_size_split_k = 0; - - for (auto& event : ctx->gc.events) { - ctx->device->device.destroyEvent(event); - } - ctx->gc.events.clear(); - - ctx->device->device.destroyFence(ctx->fence); - ctx->device->device.destroyFence(ctx->almost_ready_fence); - - for (auto& pool : ctx->descriptor_pools) { - ctx->device->device.destroyDescriptorPool(pool); - } - ctx->descriptor_pools.clear(); - ctx->descriptor_sets.clear(); - - ctx->compute_cmd_pool.destroy(ctx->device->device); - if (ctx->device->async_use_transfer_queue) { - ctx->device->device.destroySemaphore(ctx->transfer_semaphore.s); - - ctx->transfer_cmd_pool.destroy(ctx->device->device); - } - if (vk_perf_logger_enabled) { - ctx->perf_logger->print_timings(true); - } -} - -static int ggml_vk_get_device_count() { - ggml_vk_instance_init(); - - return vk_instance.device_indices.size(); -} - -static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { - ggml_vk_instance_init(); - - std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); - - vk::PhysicalDeviceProperties props; - devices[device].getProperties(&props); - - snprintf(description, description_size, "%s", props.deviceName.data()); -} - -// backend interface - -#define UNUSED GGML_UNUSED - -// device backend - -static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { - return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; -} - -static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { - VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); - ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - ggml_vk_destroy_buffer(ctx->dev_buffer); - delete ctx; -} - -static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { - return vk_ptr_base; - - UNUSED(buffer); -} - -static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } - return GGML_STATUS_SUCCESS; -} - -static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - VK_LOG_DEBUG("ggml_backend_vk_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - vk_buffer buf = buf_ctx->dev_buffer; - - if (size == 0) { - return; - } - - uint32_t val32 = (uint32_t)value * 0x01010101; - ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); -} - -static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - vk_buffer buf = buf_ctx->dev_buffer; - - if (size == 0) { - return; - } - - ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -} - -static void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, - size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { - VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " << - n_copies << ", " << stride_tensor << ", " << stride_data << ")"); - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - vk_buffer buf = buf_ctx->dev_buffer; - - if (size == 0) { - return; - } - - ggml_vk_buffer_write_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_data, stride_tensor, size, n_copies); -} - -static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - - if (size == 0) { - return; - } - - vk_buffer buf = buf_ctx->dev_buffer; - - ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -} - -static void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, - size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { - VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " << - n_copies << ", " << stride_tensor << ", " << stride_data << ")"); - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - - if (size == 0) { - return; - } - - vk_buffer buf = buf_ctx->dev_buffer; - - ggml_vk_buffer_read_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_tensor, stride_data, size, n_copies); -} - -static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - if (ggml_nbytes(src) == 0) { - return true; - } - - if (ggml_backend_buffer_is_vk(src->buffer)) { - ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - - vk_buffer src_buf = src_buf_ctx->dev_buffer; - vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - - ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); - - return true; - } - return false; - - UNUSED(buffer); -} - -static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - - ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -} - -static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { - /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, - /* .get_base = */ ggml_backend_vk_buffer_get_base, - /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor, - /* .set_tensor_2d = */ ggml_backend_vk_buffer_set_tensor_2d, - /* .get_tensor_2d = */ ggml_backend_vk_buffer_get_tensor_2d, - /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, - /* .clear = */ ggml_backend_vk_buffer_clear, - /* .reset = */ NULL, -}; - -// vk buffer type -static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { - ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - - return ctx->name.c_str(); -} - -static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); - ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; - - vk_buffer dev_buffer = nullptr; - try { - dev_buffer = ggml_vk_create_buffer_device(ctx->device, size); - } catch (const vk::SystemError& e) { - return nullptr; - } - - ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->device, std::move(dev_buffer), ctx->name); - - return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); -} - -static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; - return ctx->device->properties.limits.minStorageBufferOffsetAlignment; -} - -static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; - return ctx->device->suballocation_block_size; -} - -static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - return ggml_nbytes(tensor); - - UNUSED(buft); -} - -ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { - ggml_vk_instance_init(); - - VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); - - vk_device dev = ggml_vk_get_device(dev_num); - - return &dev->buffer_type; -} - -// host buffer type - -static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_VK_NAME "_Host"; - - UNUSED(buft); -} - -static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); - ggml_vk_host_free(vk_instance.devices[0], buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); - - size += 32; // Behave like the CPU buffer type - void * ptr = nullptr; - try { - ptr = ggml_vk_host_malloc(vk_instance.devices[0], size); - } catch (vk::SystemError& e) { - GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what()); - // fallback to cpu buffer - return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - } - - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; - - return buffer; - - UNUSED(buft); -} - -static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment; - - UNUSED(buft); -} - -static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - return vk_instance.devices[0]->suballocation_block_size; - - UNUSED(buft); -} - -// Should be changed to return device-specific host buffer type -// but that probably requires changes in llama.cpp -ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_vk_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - ggml_vk_instance_init(); - ggml_vk_get_device(0); - - return &ggml_backend_vk_buffer_type_host; -} - - -// backend - -static const char * ggml_backend_vk_name(ggml_backend_t backend) { - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return ctx->name.c_str(); -} - -static void ggml_backend_vk_free(ggml_backend_t backend) { - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); - - ggml_vk_cleanup(ctx); - - delete ctx; - delete backend; -} - -static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return &ctx->device->buffer_type; -} - -static void ggml_backend_vk_set_tensor_2d_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, - size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { - VK_LOG_DEBUG("ggml_backend_vk_set_tensor_2d_async(" << size << ", " << n_copies << ")"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - - if (size == 0) { - return; - } - - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - - vk_context cpy_ctx; - - if (ctx->device->async_use_transfer_queue) { - if (ctx->transfer_ctx.expired()) { - cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); - ctx->transfer_ctx = cpy_ctx; - ggml_vk_ctx_begin(ctx->device, cpy_ctx); - } else { - cpy_ctx = ctx->transfer_ctx.lock(); - } - } else { - cpy_ctx = ggml_vk_get_compute_ctx(ctx); - } - - vk_buffer buf = buf_ctx->dev_buffer; - - auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; - - bool ret = ggml_vk_buffer_write_2d_async(cpy_ctx, buf, dst_offset, data, stride_data, stride_tensor, size, n_copies); - - if (!ret) { - const size_t staging_size = size * n_copies; - ggml_vk_ensure_sync_staging_buffer(ctx, staging_size); - ggml_vk_sync_buffers(nullptr, cpy_ctx); - - std::vector slices(1); - if (size == stride_tensor) { - slices[0].srcOffset = 0; - slices[0].dstOffset = dst_offset; - slices[0].size = staging_size; - } else { - slices.resize(n_copies); - for (size_t i = 0; i < n_copies; i++) { - slices[i].srcOffset = i * size; - slices[i].dstOffset = dst_offset + i * stride_tensor; - slices[i].size = size; - } - } - - cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, slices); - - if (size == stride_data) { - deferred_memcpy(ctx->sync_staging->ptr, data, staging_size, &cpy_ctx->in_memcpys); - } else { - for (size_t i = 0; i < n_copies; i++) { - deferred_memcpy((uint8_t *)ctx->sync_staging->ptr + i * size, (const uint8_t *)data + i * stride_data, size, &cpy_ctx->in_memcpys); - } - } - ggml_vk_synchronize(ctx); - } -} - -static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); - ggml_backend_vk_set_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size); -} - -static void ggml_backend_vk_get_tensor_2d_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, - size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { - VK_LOG_DEBUG("ggml_backend_vk_get_tensor_2d_async(" << size << ", " << n_copies << ")"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - - if (size == 0) { - return; - } - - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - - vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - - vk_buffer buf = buf_ctx->dev_buffer; - - auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; - bool ret = ggml_vk_buffer_read_2d_async(compute_ctx, buf, src_offset, data, stride_tensor, stride_data, size, n_copies); - - if (!ret) { - const size_t staging_size = size * n_copies; - ggml_vk_ensure_sync_staging_buffer(ctx, staging_size); - ggml_vk_sync_buffers(nullptr, compute_ctx); - - std::vector slices(1); - if (size == stride_tensor) { - slices[0].srcOffset = src_offset; - slices[0].dstOffset = 0; - slices[0].size = staging_size; - } else { - slices.resize(n_copies); - for (size_t i = 0; i < n_copies; i++) { - slices[i].srcOffset = src_offset + i * stride_tensor; - slices[i].dstOffset = i * size; - slices[i].size = size; - } - } - - compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, slices); - - if (size == stride_data) { - deferred_memcpy(data, ctx->sync_staging->ptr, staging_size, &compute_ctx->out_memcpys); - } else { - for (size_t i = 0; i < n_copies; i++) { - deferred_memcpy((uint8_t *)data + i * stride_data, (const uint8_t *)ctx->sync_staging->ptr + i * size, size, &compute_ctx->out_memcpys); - } - } - ggml_vk_synchronize(ctx); - } -} - -static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); - ggml_backend_vk_get_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size); -} - -static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; - - // Skip zero-size tensors - if (ggml_nbytes(src) == 0) { - return true; - } - - if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) { - return false; - } - - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - - if (ggml_backend_buffer_is_vk(src->buffer)) { - ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; - - // Async copy only works within the same device - if (src_buf_ctx->dev_buffer->device != dst_buf->device) { - return false; - } - - vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - - ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, - src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs, - ggml_nbytes(src)); - return true; - } - - if (ggml_backend_buffer_is_host(src->buffer)) { - vk_buffer pinned_buf = nullptr; - size_t pinned_offset = 0; - ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset); - if (pinned_buf == nullptr) { - return false; - } - - vk_context cpy_ctx; - if (ctx->device->async_use_transfer_queue) { - if (ctx->transfer_ctx.expired()) { - cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); - ctx->transfer_ctx = cpy_ctx; - ggml_vk_ctx_begin(ctx->device, cpy_ctx); - } else { - cpy_ctx = ctx->transfer_ctx.lock(); - } - } else { - cpy_ctx = ggml_vk_get_compute_ctx(ctx); - } - - return ggml_vk_buffer_write_async(cpy_ctx, dst_buf, - vk_tensor_offset(dst) + dst->view_offs, - src->data, ggml_nbytes(src)); - } - - GGML_UNUSED(backend_src); - return false; -} - -static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { - VK_LOG_DEBUG("ggml_vk_synchronize()"); - - bool do_transfer = !ctx->compute_ctx.expired(); - - if (ggml_vk_submit_transfer_ctx(ctx)) { - ctx->submit_pending = true; - } - - vk_context compute_ctx; - vk_command_buffer* cmd_buf = nullptr; - if (do_transfer) { - compute_ctx = ctx->compute_ctx.lock(); - if (compute_ctx->s) { - cmd_buf = compute_ctx->s->buffer; - } - - ggml_vk_ctx_end(compute_ctx); - - for (auto& cpy : compute_ctx->in_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - - ggml_vk_submit(compute_ctx, {}); - ctx->submit_pending = true; - } - - if (ctx->submit_pending) { - if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { - vk::TimelineSemaphoreSubmitInfo tl_info{ - 1, &ctx->transfer_semaphore.value, - 0, nullptr, - }; - vk::PipelineStageFlags stage = ctx->device->transfer_queue.stage_flags; - vk::SubmitInfo si{ - 1, &ctx->transfer_semaphore.s, &stage, - 0, nullptr, - 0, nullptr, - }; - si.setPNext(&tl_info); - std::lock_guard guard(queue_mutex); - ctx->device->compute_queue.queue.submit({ si }, ctx->fence); - ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; - } else { - std::lock_guard guard(queue_mutex); - ctx->device->compute_queue.queue.submit({}, ctx->fence); - } - ggml_vk_wait_for_fence(ctx); - ctx->submit_pending = false; - if (cmd_buf) { - cmd_buf->in_use = false; - cmd_buf->buf.reset(); - } - } - - if (do_transfer) { - for (auto& cpy : compute_ctx->out_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } - ctx->compute_ctx.reset(); - } -} - -static void ggml_backend_vk_synchronize(ggml_backend_t backend) { - VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - ggml_vk_synchronize(ctx); - - ggml_vk_graph_cleanup(ctx); -} - -static bool ggml_vk_is_empty(ggml_tensor * node) { - return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; -} - -static bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { - if (!ggml_can_fuse(cgraph, node_idx, ops)) { - return false; - } - - if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) { - // additional constraints specific to this fusion - const ggml_tensor *rms_norm = cgraph->nodes[node_idx]; - const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; - - GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); - // rms_norm only supports f32 - if (mul->src[0]->type != GGML_TYPE_F32 || - mul->src[1]->type != GGML_TYPE_F32 || - mul->type != GGML_TYPE_F32) { - return false; - } - // if rms_norm is the B operand, then we don't handle broadcast - if (rms_norm == mul->src[1] && - !ggml_are_same_shape(mul->src[0], rms_norm)) { - return false; - } - // rms_norm shader assumes contiguous rows - if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { - return false; - } - } - auto const &mm_add_ok = [&](const ggml_tensor *mul, const ggml_tensor *add) { - const ggml_tensor *bias = add->src[0] == mul ? add->src[1] : add->src[0]; - - // mat-vec only - if (ggml_nrows(mul) != 1) { - return false; - } - // shaders assume the types match - if (mul->type != bias->type) { - return false; - } - // shaders reuse the D shape for bias - if (!ggml_are_same_shape(mul, bias) || - !ggml_are_same_stride(mul, bias)) { - return false; - } - // unaligned bias isn't handled - if (get_misalign_bytes(ctx, bias) != 0) { - return false; - } - return true; - }; - - if ((ops.size() == 2 || ops.size() == 3) && ops.begin()[0] == GGML_OP_MUL_MAT && ops.begin()[1] == GGML_OP_ADD) { - // additional constraints specific to this fusion - const ggml_tensor *mul = cgraph->nodes[node_idx]; - const ggml_tensor *add = cgraph->nodes[node_idx + 1]; - - if (!mm_add_ok(mul, add)) { - return false; - } - if (ops.size() == 3) { - if (ops.begin()[2] != GGML_OP_ADD) { - return false; - } - if (!mm_add_ok(add, cgraph->nodes[node_idx + 2])) { - return false; - } - } - } - - auto const &mmid_mul_ok = [&](const ggml_tensor *mmid, const ggml_tensor *mul) { - const ggml_tensor *scale = mul->src[1]; - - if (mmid != mul->src[0]) { - return false; - } - // mat-vec only - if (!ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { - return false; - } - // shaders assume the types match - if (mmid->type != scale->type) { - return false; - } - // shaders assume the bias is contiguous - if (!ggml_is_contiguous(scale)) { - return false; - } - // unaligned bias isn't handled - if (get_misalign_bytes(ctx, scale) != 0) { - return false; - } - // shader only indexes by expert index - if (scale->ne[0] != 1 || - scale->ne[1] != mul->ne[1] || - scale->ne[2] != 1 || - scale->ne[3] != 1) { - return false; - } - return true; - }; - - if ((ops.size() == 2 || ops.size() == 3) && ops.begin()[0] == GGML_OP_MUL_MAT_ID && ops.begin()[1] == GGML_OP_ADD_ID) { - // additional constraints specific to this fusion - const ggml_tensor *mul = cgraph->nodes[node_idx]; - const ggml_tensor *add = cgraph->nodes[node_idx + 1]; - const ggml_tensor *bias = add->src[1]; - - if (mul != add->src[0]) { - return false; - } - // mat-vec only - if (!ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { - return false; - } - // shaders assume the types match - if (mul->type != bias->type) { - return false; - } - // shaders assume the bias is contiguous - if (!ggml_is_contiguous(bias)) { - return false; - } - // the ID tensor must be the same for mul_mat_id and add_id - if (mul->src[2] != add->src[2]) { - return false; - } - // unaligned bias isn't handled - if (get_misalign_bytes(ctx, bias) != 0) { - return false; - } - - if (ops.size() == 3) { - if (ops.begin()[2] != GGML_OP_MUL) { - return false; - } - const ggml_tensor *mul = cgraph->nodes[node_idx + 2]; - return mmid_mul_ok(add, mul); - } - } - - if (ops.size() == 2 && ops.begin()[0] == GGML_OP_MUL_MAT_ID && ops.begin()[1] == GGML_OP_MUL) { - // additional constraints specific to this fusion - const ggml_tensor *mmid = cgraph->nodes[node_idx]; - const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; - - if (!mmid_mul_ok(mmid, mul)) { - return false; - } - } - - return true; -} - -// Match SSM_CONV + UNARY(SILU) or SSM_CONV + ADD + UNARY(SILU). num_extra is 1 or 2. -static bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, - int node_idx, int num_extra) { - const ggml_tensor * conv = cgraph->nodes[node_idx]; - if (conv->op != GGML_OP_SSM_CONV) { - return false; - } - - const ggml_tensor * silu = nullptr; - const ggml_tensor * bias = nullptr; - - if (num_extra == 1) { - if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_UNARY })) { - return false; - } - silu = cgraph->nodes[node_idx + 1]; - } else if (num_extra == 2) { - if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY })) { - return false; - } - const ggml_tensor * add = cgraph->nodes[node_idx + 1]; - silu = cgraph->nodes[node_idx + 2]; - bias = (add->src[0] == conv) ? add->src[1] : add->src[0]; - - if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) { - return false; - } - // bias must be channel-wise (one element per channel of the conv output) - if (ggml_nelements(bias) != conv->ne[0] || bias->ne[0] != conv->ne[0]) { - return false; - } - if (add->type != GGML_TYPE_F32) { - return false; - } - // The shader doesn't apply per-tensor offsets, so reject misaligned bias. - if (get_misalign_bytes(ctx, bias) != 0) { - return false; - } - } else { - return false; - } - - if (ggml_get_unary_op(silu) != GGML_UNARY_OP_SILU) { - return false; - } - if (conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) { - return false; - } - // The shader writes to the fused dst using its own strides, but the push constants don't - // carry a per-tensor offset, so the binding must be naturally aligned. - if (get_misalign_bytes(ctx, silu) != 0) { - return false; - } - return true; -} - -static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, - int node_idx, topk_moe_mode mode) { - - const ggml_tensor * softmax; - const ggml_tensor * weights; - const ggml_tensor * get_rows; - const ggml_tensor * argsort; - - switch (mode) { - case TOPK_MOE_EARLY_SOFTMAX_NORM: - softmax = cgraph->nodes[node_idx + 0]; - weights = cgraph->nodes[node_idx + 9]; - get_rows = cgraph->nodes[node_idx + 4]; - argsort = cgraph->nodes[node_idx + 2]; - break; - case TOPK_MOE_SIGMOID_NORM_BIAS: - softmax = cgraph->nodes[node_idx + 0]; // really sigmoid - weights = cgraph->nodes[node_idx + 10]; - get_rows = cgraph->nodes[node_idx + 5]; - argsort = cgraph->nodes[node_idx + 3]; - if (ggml_get_unary_op(softmax) != GGML_UNARY_OP_SIGMOID) { - return false; - } - // bias is expected to be 1D - if (ggml_nrows(cgraph->nodes[node_idx + 2]->src[1]) != 1 || - !ggml_is_contiguous(cgraph->nodes[node_idx + 2]->src[1])) { - return false; - } - // sigmoid fusion seems to generate infinities on moltenvk - if (ctx->device->driver_id == vk::DriverId::eMoltenvk) { - return false; - } - break; - case TOPK_MOE_EARLY_SOFTMAX: - softmax = cgraph->nodes[node_idx + 0]; - weights = cgraph->nodes[node_idx + 4]; - get_rows = cgraph->nodes[node_idx + 4]; - argsort = cgraph->nodes[node_idx + 2]; - break; - case TOPK_MOE_LATE_SOFTMAX: - softmax = cgraph->nodes[node_idx + 4]; - weights = cgraph->nodes[node_idx + 5]; - get_rows = cgraph->nodes[node_idx + 2]; - argsort = cgraph->nodes[node_idx + 0]; - break; - default: - return false; - } - - ggml_tensor * probs = get_rows->src[0]; - if (probs->op != GGML_OP_RESHAPE) { - return false; - } - probs = probs->src[0]; - ggml_tensor * selection_probs = argsort->src[0]; - - if (probs != selection_probs && mode != TOPK_MOE_SIGMOID_NORM_BIAS) { - return false; - } - - if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) { - return false; - } - - if (softmax->op == GGML_OP_SOFT_MAX) { - const float * op_params = (const float *)softmax->op_params; - - float scale = op_params[0]; - float max_bias = op_params[1]; - - if (scale != 1.0f || max_bias != 0.0f) { - return false; - } - - // don't fuse when masks or sinks are present - if (softmax->src[1] || softmax->src[2]) { - return false; - } - } - - const int n_expert = softmax->ne[0]; - if (n_expert > (1 << (num_topk_moe_pipelines-1))) { - return false; - } - - if (!ctx->device->subgroup_arithmetic || - !ctx->device->subgroup_shuffle || - !ctx->device->subgroup_require_full_support || - ctx->device->disable_fusion) { - return false; - } - - return true; -} - -static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, - int node_idx) { - GGML_UNUSED(ctx); - const ggml_tensor *rope = cgraph->nodes[node_idx + 0]; - const ggml_tensor *view = cgraph->nodes[node_idx + 1]; - const ggml_tensor *set_rows = cgraph->nodes[node_idx + 2]; - - // ne3 not tested - if (rope->src[0]->ne[3] != 1) { - return false; - } - - if (set_rows->type != GGML_TYPE_F32 && set_rows->type != GGML_TYPE_F16) { - return false; - } - - if (set_rows->src[1]->type != GGML_TYPE_I64) { - return false; - } - - // The view should flatten two dims of rope into one dim - if (!ggml_is_contiguous(view) || - view->ne[0] != rope->ne[0] * rope->ne[1]) { - return false; - } - - // Only norm/neox/mrope shaders have the fusion code - const int mode = ((const int32_t *) rope->op_params)[2]; - if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_MROPE) { - return false; - } - - return true; -} - -// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add. -// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that -// the broadcast operands a and inv_b share a [1, C] layout. -static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { - GGML_UNUSED(ctx); - if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) { - return false; - } - - const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; - const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1]; - const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; - const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; - const ggml_tensor * add = cgraph->nodes[node_idx + 4]; - - const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; - const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; - - const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; - const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0]; - - if (x_in_add != x) { - return false; - } - if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) { - return false; - } - // Shader bindings: data_a is A_TYPE so it follows x's precision, while - // data_b and data_c are hardcoded float, so the broadcast operands must - // be F32 regardless of x's type. - if (a->type != GGML_TYPE_F32) return false; - if (inv_b->type != GGML_TYPE_F32) return false; - // Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline). - if (mul0->type != x->type) return false; - if (sin_node->type != x->type) return false; - if (sqr->type != x->type) return false; - if (mul1->type != x->type) return false; - if (add->type != x->type) return false; - if (!ggml_are_same_shape(a, inv_b)) { - return false; - } - if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) { - return false; - } - // Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b - // must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader. - if (x->ne[2] != 1 || x->ne[3] != 1) return false; - if (add->ne[2] != 1 || add->ne[3] != 1) return false; - if (a->ne[2] != 1 || a->ne[3] != 1) return false; - if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false; - // Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1], - // so every operand must be contiguous. - if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) || - !ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) { - return false; - } - return true; -} - -// Check whether the tensors overlap in memory. -// Fusions can potentially overwrite src tensors in ways that are not prevented -// by ggml-alloc. If the fusion src is being applied in a way that's elementwise -// with the destination, then it's OK for them to overlap if they are exactly equal. -static bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise) { - ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)a->buffer->context; - vk_buffer a_buf = a_buf_ctx->dev_buffer; - ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context; - vk_buffer b_buf = b_buf_ctx->dev_buffer; - if (a_buf == b_buf) { - auto a_base = vk_tensor_offset(a) + a->view_offs; - auto a_size = ggml_nbytes(a); - auto b_base = vk_tensor_offset(b) + b->view_offs; - auto b_size = ggml_nbytes(b); - - if (elementwise && a_base == b_base && a_size == b_size) { - return false; - } - - if ((b_base <= a_base && a_base < b_base + b_size) || - (a_base <= b_base && b_base < a_base + a_size)) { - return true; - } - } - return false; -} - -static bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, - int node_idx) { - GGML_UNUSED(ctx); - const ggml_tensor *rms = cgraph->nodes[node_idx + 0]; - const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; - const ggml_tensor *rope = cgraph->nodes[node_idx + 2]; - - const int mode = ((const int32_t *) rope->op_params)[2]; - - // noncontig tensors aren't tested, and don't seem common in practice - if (!ggml_is_contiguous(rms) || - !ggml_is_contiguous(mul) || - !ggml_is_contiguous(rope)) { - return false; - } - - // only norm/neox are handled in the shader - if (mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_NORMAL) { - return false; - } - - // shared memory size for passing data from mul->rope - if (mul->ne[0] > 1024) { - return false; - } - - // conditions for pipeline creation - if (sizeof(vk_op_rms_norm_mul_rope_push_constants) > ctx->device->properties.limits.maxPushConstantsSize) { - return false; - } - - return true; -} - -static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { - - const ggml_tensor *first_node = cgraph->nodes[node_idx]; - if (first_node->op != GGML_OP_ADD) { - return 0; - } - - if (!ctx->device->multi_add) { - return 0; - } - - int32_t num_adds = 1; - while (node_idx + num_adds < cgraph->n_nodes && - cgraph->nodes[node_idx + num_adds]->op == GGML_OP_ADD && - num_adds < MAX_FUSED_ADDS) { - num_adds++; - } - - // The shader currently requires same shapes (but different strides are allowed), - // everything f32, and no misalignment - for (int32_t i = 0; i < num_adds; ++i) { - const ggml_tensor *next_node = cgraph->nodes[node_idx + i]; - if (!ggml_are_same_shape(first_node, next_node->src[0]) || - !ggml_are_same_shape(first_node, next_node->src[1]) || - next_node->type != GGML_TYPE_F32 || - next_node->src[0]->type != GGML_TYPE_F32 || - next_node->src[1]->type != GGML_TYPE_F32 || - get_misalign_bytes(ctx, next_node) || - get_misalign_bytes(ctx, next_node->src[0]) || - get_misalign_bytes(ctx, next_node->src[1])) { - num_adds = i; - } - } - - // Verify we can fuse these - ggml_op adds[MAX_FUSED_ADDS]; - for (int32_t i = 0; i < num_adds; ++i) { - adds[i] = GGML_OP_ADD; - } - - // decrease num_adds if they can't all be fused - while (num_adds > 1 && !ggml_can_fuse(cgraph, node_idx, adds, num_adds)) { - num_adds--; - } - - // a single add is not "fused", so just return zero - if (num_adds == 1) { - return 0; - } - return num_adds; -} - -static int32_t find_first_set(uint32_t x) { - int32_t ret = 0; - if (!x) { - return -1; - } - while (!(x & 1)) { - x >>= 1; - ret++; - } - return ret; -} - -static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - if (vk_instance.debug_utils_support) { - vk::DebugUtilsLabelEXT dul = {}; - dul.pLabelName = "ggml_backend_vk_graph_compute"; - dul.color = std::array{1.0f, 1.0f, 1.0f, 1.0f}; - vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast(&dul)); - } - - ctx->prealloc_size_add_rms_partials_offset = 0; - ctx->do_add_rms_partials = false; - ctx->do_add_rms_partials_offset_calculation = false; - - int last_node = cgraph->n_nodes - 1; - - // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly - while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) { - last_node -= 1; - } - - // Reserve tensor context space for all nodes - ctx->tensor_ctxs.resize(cgraph->n_nodes); - - bool first_node_in_batch = true; // true if next node will be first node in a batch - int submit_node_idx = 0; // index to first node in a batch - - ggml_vk_submit_transfer_ctx(ctx); - - vk_context compute_ctx; - if (vk_perf_logger_enabled) { - // allocate/resize the query pool - if (ctx->num_queries < cgraph->n_nodes + 1) { - if (ctx->query_pool) { - ctx->device->device.destroyQueryPool(ctx->query_pool); - } - vk::QueryPoolCreateInfo query_create_info; - query_create_info.queryType = vk::QueryType::eTimestamp; - query_create_info.queryCount = cgraph->n_nodes + 100; - ctx->query_pool = ctx->device->device.createQueryPool(query_create_info); - ctx->num_queries = query_create_info.queryCount; - ctx->query_fusion_names.resize(ctx->num_queries); - ctx->query_fusion_node_count.resize(ctx->num_queries); - ctx->query_nodes.resize(ctx->num_queries); - ctx->query_node_idx.resize(ctx->num_queries); - } - - ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1); - std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr); - std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0); - std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr); - std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0); - - GGML_ASSERT(ctx->compute_ctx.expired()); - compute_ctx = ggml_vk_get_compute_ctx(ctx); - ctx->query_idx = 0; - compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); - ggml_vk_sync_buffers(ctx, compute_ctx); - } - - ctx->prealloc_y_last_pipeline_used = nullptr; - ctx->prealloc_y_last_tensor_used = nullptr; - ctx->prealloc_y_last_decode_vector_staging = false; - - if (ctx->prealloc_size_add_rms_partials) { - ggml_vk_preallocate_buffers(ctx, nullptr); - compute_ctx = ggml_vk_get_compute_ctx(ctx); - // initialize partial sums to zero. - ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials); - ggml_vk_sync_buffers(ctx, compute_ctx); - } - - // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. - // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB - // (and scaled down based on model size, so smaller models submit earlier). - // Also submit at least every 100 nodes, in case there are workloads without as much matmul. - int nodes_per_submit = 100; - int submitted_nodes = 0; - int submit_count = 0; - uint64_t mul_mat_bytes = 0; - uint64_t total_mul_mat_bytes = 0; - uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u); - for (int i = 0; i < cgraph->n_nodes; i++) { - if (first_node_in_batch) { - submit_node_idx = i; - } - - if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { - auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]); - mul_mat_bytes += bytes; - total_mul_mat_bytes += bytes; - } - - // op_srcs_fused_elementwise indicates whether an op's srcs all contribute to - // the fused result in an elementwise-way. This affects whether the memory for - // the src is allowed to overlap the memory for the destination. - // The array is sized to handle the largest fusion (asserted later). - bool op_srcs_fused_elementwise[12]; - - ctx->fused_topk_moe_mode = TOPK_MOE_COUNT; - ctx->fused_topk_moe_scale = false; - const char *fusion_string {}; - if (!ctx->device->disable_fusion) { - uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); - if (num_adds) { - ctx->num_additional_fused_ops = num_adds - 1; - fusion_string = "MULTI_ADD"; - std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, true); - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) { - ctx->num_additional_fused_ops = 2; - fusion_string = "MUL_MAT_ADD_ADD"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - op_srcs_fused_elementwise[2] = true; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { - ctx->num_additional_fused_ops = 1; - fusion_string = "MUL_MAT_ADD"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 2; - fusion_string = "MUL_MAT_ID_ADD_ID_MUL"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - op_srcs_fused_elementwise[2] = true; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { - ctx->num_additional_fused_ops = 1; - fusion_string = "MUL_MAT_ID_ADD_ID"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 1; - fusion_string = "MUL_MAT_ID_MUL"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) && - ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) && - ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) && - ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) { - ctx->num_additional_fused_ops = 4; - fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = false; - op_srcs_fused_elementwise[2] = false; - op_srcs_fused_elementwise[3] = false; - op_srcs_fused_elementwise[4] = false; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&& - ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) { - ctx->num_additional_fused_ops = 2; - fusion_string = "RMS_NORM_MUL_ROPE"; - // rope is approximately elementwise - whole rows are done by a single workgroup and it's row-wise - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - op_srcs_fused_elementwise[2] = true; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 1; - fusion_string = "RMS_NORM_MUL"; - // rms_norm is not elementwise, but whole rows must be consumed and the scale factor computed before - // they are overwritten, and one workgroup per row. So close enough. - op_srcs_fused_elementwise[0] = true; - op_srcs_fused_elementwise[1] = true; - } else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 2)) { - ctx->num_additional_fused_ops = 2; - fusion_string = "SSM_CONV_BIAS_SILU"; - // ssm_conv reads multiple input tokens per output, so it's not elementwise w.r.t. its srcs. - // The downstream add and silu are elementwise on the conv output. - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - op_srcs_fused_elementwise[2] = true; - } else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 1)) { - ctx->num_additional_fused_ops = 1; - fusion_string = "SSM_CONV_SILU"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = true; - } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && - ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && - ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) { - ctx->num_additional_fused_ops = 2; - fusion_string = "ROPE_VIEW_SET_ROWS"; - op_srcs_fused_elementwise[0] = false; - op_srcs_fused_elementwise[1] = false; - op_srcs_fused_elementwise[2] = false; - } else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) { - ctx->num_additional_fused_ops = 4; - fusion_string = "SNAKE"; - // elementwise=true: snake.comp is safe under exact aliasing because each - // thread reads data_x[idx] into a register before writing data_d[idx] - // with a data dependency on that register. The overlap check still - // rejects partial overlaps (different base or size). - std::fill_n(op_srcs_fused_elementwise, 5, true); - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) && - ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) { - ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1; - // view of argsort writes to memory - ctx->fused_ops_write_mask |= 1 << 3; - ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM; - fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM"; - std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) && - ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) { - ctx->num_additional_fused_ops = topk_moe_sigmoid_norm_bias.size() - 1; - // view of argsort writes to memory - ctx->fused_ops_write_mask |= 1 << 4; - ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS; - fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS"; - std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) && - ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) { - ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1; - // view of argsort writes to memory - ctx->fused_ops_write_mask |= 1 << 3; - ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX; - fusion_string = "TOPK_MOE_EARLY_SOFTMAX"; - std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) && - ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) { - ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1; - // view of argsort writes to memory - ctx->fused_ops_write_mask |= 1 << 1; - ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX; - fusion_string = "TOPK_MOE_LATE_SOFTMAX"; - std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false); - } - if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { - // Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano. - if (ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops - 1, { GGML_OP_DIV, GGML_OP_RESHAPE, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 }) || - ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) { - ctx->fused_topk_moe_scale = true; - ctx->num_additional_fused_ops++; - op_srcs_fused_elementwise[ctx->num_additional_fused_ops] = false; - } - } - } - GGML_ASSERT(ctx->num_additional_fused_ops < (int)(sizeof(op_srcs_fused_elementwise) / sizeof(op_srcs_fused_elementwise[0]))); - ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops; - - // Check whether fusion would overwrite src operands while they're still in use. - // If so, disable fusion. - if (ctx->num_additional_fused_ops) { - // There are up to two output nodes - topk_moe has two. - uint32_t bits = ctx->fused_ops_write_mask & ~(1 << ctx->num_additional_fused_ops); - ggml_tensor *output_nodes[2] {}; - output_nodes[0] = cgraph->nodes[i + ctx->num_additional_fused_ops]; - if (bits) { - int output_idx = find_first_set(bits); - GGML_ASSERT(bits == (1u << output_idx)); - output_nodes[1] = cgraph->nodes[i + output_idx]; - } - - bool need_disable = false; - - // topk_moe often overwrites the source, but for a given row all the src values are - // loaded before anything is stored. If there's only one row, this is safe, so treat - // this as a special case. - bool is_topk_moe_single_row = ctx->fused_topk_moe_mode != TOPK_MOE_COUNT && - ggml_nrows(cgraph->nodes[i]->src[0]) == 1; - - if (!is_topk_moe_single_row) { - for (int j = 0; j < 2; ++j) { - ggml_tensor *dst = output_nodes[j]; - if (!dst) { - continue; - } - // Loop over all srcs of all nodes in the fusion. If the src overlaps - // the destination and the src is not an intermediate node that's being - // elided, then disable fusion. - for (int k = 0; k <= ctx->num_additional_fused_ops; ++k) { - for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { - ggml_tensor *src = cgraph->nodes[i + k]->src[s]; - if (!src || src->op == GGML_OP_NONE) { - continue; - } - if (ggml_vk_tensors_overlap(src, dst, op_srcs_fused_elementwise[k])) { - bool found = false; - for (int n = 0; n < k; ++n) { - if (cgraph->nodes[i + n] == src) { - found = true; - break; - } - } - if (!found) { - need_disable = true; - } - } - } - } - } - } - if (need_disable) { - ctx->num_additional_fused_ops = 0; - ctx->fused_ops_write_mask = 1; - ctx->fused_topk_moe_mode = TOPK_MOE_COUNT; - ctx->fused_topk_moe_scale = false; - } - } - - // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) - bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; - bool submit = (submitted_nodes >= nodes_per_submit) || - (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) || - (i + ctx->num_additional_fused_ops >= last_node) || - (almost_ready && !ctx->almost_ready_fence_pending); - - bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); - - if (vk_perf_logger_enabled && enqueued) { - compute_ctx = ggml_vk_get_compute_ctx(ctx); - if (!vk_perf_logger_concurrent) { - // track a single node/fusion for the current query - ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; - ctx->query_fusion_names[ctx->query_idx] = fusion_string; - compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); - ggml_vk_sync_buffers(ctx, compute_ctx); - } else { - // track a fusion string and number of fused ops for the current node_idx - ctx->query_fusion_names[i] = fusion_string; - ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops; - } - } - - if (enqueued) { - ++submitted_nodes; - -#ifndef GGML_VULKAN_CHECK_RESULTS - if (first_node_in_batch) { - first_node_in_batch = false; - } -#endif - } - - if (submit && enqueued) { - first_node_in_batch = true; - submitted_nodes = 0; - mul_mat_bytes = 0; - if (submit_count < 3) { - mul_mat_bytes_per_submit *= 2; - } - submit_count++; - } - i += ctx->num_additional_fused_ops; - ctx->num_additional_fused_ops = 0; - ctx->fused_ops_write_mask = 0; - } - - ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; - - if (vk_perf_logger_enabled) { - // End the command buffer and submit/wait - GGML_ASSERT(!ctx->compute_ctx.expired()); - compute_ctx = ctx->compute_ctx.lock(); - ggml_vk_ctx_end(compute_ctx); - - ggml_vk_submit(compute_ctx, ctx->device->fence); - VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences"); - ctx->device->device.resetFences({ ctx->device->fence }); - ctx->compute_ctx.reset(); - - // Get the results and pass them to the logger - std::vector timestamps(cgraph->n_nodes + 1); - VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); - if (!vk_perf_logger_concurrent) { - // Log each op separately - for (int i = 1; i < ctx->query_idx; i++) { - auto node = ctx->query_nodes[i]; - auto name = ctx->query_fusion_names[i]; - ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); - } - } else { - // Log each group of nodes - int prev_node_idx = 0; - for (int i = 1; i < ctx->query_idx; i++) { - auto cur_node_idx = ctx->query_node_idx[i]; - std::vector nodes; - std::vector names; - for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) { - if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) { - continue; - } - nodes.push_back(cgraph->nodes[node_idx]); - names.push_back(ctx->query_fusion_names[node_idx]); - node_idx += ctx->query_fusion_node_count[node_idx]; - } - prev_node_idx = cur_node_idx; - ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); - } - } - ctx->perf_logger->print_timings(); - } - - if (!ctx->device->support_async) { - ggml_vk_synchronize(ctx); - } - - return GGML_STATUS_SUCCESS; - - UNUSED(backend); -} - -// Sort the graph for improved parallelism. -static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph) -{ - VK_LOG_DEBUG("ggml_vk_graph_optimize(" << graph->n_nodes << " nodes)"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - if (ctx->device->disable_graph_optimize) { - return; - } - - auto const &is_empty = [](ggml_tensor * node) -> bool { - return node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; - }; - - auto const &is_src_of = [](const ggml_tensor *dst, const ggml_tensor *src) -> bool { - for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { - if (dst->src[s] == src) { - return true; - } - } - // implicit dependency if they view the same tensor - const ggml_tensor *dst2 = dst->view_src ? dst->view_src : dst; - const ggml_tensor *src2 = src->view_src ? src->view_src : src; - if (dst2 == src2) { - return true; - } - return false; - }; - - std::vector new_order; - std::vector used(graph->n_nodes, false); - std::set used_node_set; - - int first_unused = 0; - while (first_unused < graph->n_nodes) { - std::vector current_set; - - // Check for fusion patterns and avoid reordering them - auto const &match_pattern = [&](const std::initializer_list &pattern, int start) -> bool { - if (start + (int)pattern.size() <= graph->n_nodes) { - bool is_pattern = true; - for (size_t j = 0; j < pattern.size(); ++j) { - if (graph->nodes[start + j]->op != pattern.begin()[j] || used[start + j]) { - is_pattern = false; - } - } - return is_pattern; - } - return false; - }; - - auto const &keep_pattern = [&](const std::initializer_list &pattern) -> bool { - if (match_pattern(pattern, first_unused)) { - for (size_t j = 0; j < pattern.size(); ++j) { - new_order.push_back(graph->nodes[first_unused + j]); - used_node_set.insert(graph->nodes[first_unused + j]); - used[first_unused + j] = true; - } - while (first_unused < graph->n_nodes && used[first_unused]) { - first_unused++; - } - return true; - } - return false; - }; - - if (keep_pattern(topk_moe_early_softmax_norm)) { - continue; - } - if (keep_pattern(topk_moe_sigmoid_norm_bias)) { - continue; - } - if (keep_pattern(topk_moe_early_softmax)) { - continue; - } - if (keep_pattern(topk_moe_late_softmax)) { - continue; - } - if (keep_pattern(snake_pattern)) { - continue; - } - - // First, grab the next unused node. - current_set.push_back(first_unused); - - // Loop through the next N nodes. Grab any that don't depend on other nodes that - // haven't already been run. Nodes that have already been run have used[i] set - // to true. Allow nodes that depend on the previous node if it's a fusion pattern - // that we support (e.g. RMS_NORM + MUL). - // This first pass only grabs "real" (non-view nodes). Second pass grabs view nodes. - // The goal is to not interleave real and view nodes in a way that breaks fusion. - const int NUM_TO_CHECK = 20; - for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { - if (used[j]) { - continue; - } - if (is_empty(graph->nodes[j])) { - continue; - } - // Don't pull forward nodes from fusion patterns - if (match_pattern(topk_moe_early_softmax_norm, j) || - match_pattern(topk_moe_sigmoid_norm_bias, j) || - match_pattern(topk_moe_early_softmax, j) || - match_pattern(topk_moe_late_softmax, j) || - match_pattern(snake_pattern, j)) { - continue; - } - bool ok = true; - for (int c = first_unused; c < j; ++c) { - if (!used[c] && - is_src_of(graph->nodes[j], graph->nodes[c]) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT && graph->nodes[j]->op == GGML_OP_ADD) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_MUL) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_ADD && graph->nodes[j]->op == GGML_OP_ADD) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_ADD) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_UNARY)) { - ok = false; - break; - } - } - if (ok) { - current_set.push_back(j); - - int rope_idx = j; - - // When we've found RMS_NORM + MUL, try to find a ROPE that uses it - if (j > 0 && - graph->nodes[j]->op == GGML_OP_MUL && - graph->nodes[j-1]->op == GGML_OP_RMS_NORM) { - for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { - if (graph->nodes[k]->op == GGML_OP_ROPE && - graph->nodes[k]->src[0] == graph->nodes[j] && - // Check that other srcs are already valid - graph->nodes[k]->src[1]->op == GGML_OP_NONE && - (graph->nodes[k]->src[2] == nullptr || graph->nodes[k]->src[2]->op == GGML_OP_NONE)) { - rope_idx = k; - current_set.push_back(rope_idx); - used[rope_idx] = true; - break; - } - } - } - // Look for ROPE + VIEW + SET_ROWS and make them consecutive - if (graph->nodes[rope_idx]->op == GGML_OP_ROPE) { - int view_idx = -1; - int set_rows_idx = -1; - for (int k = rope_idx+1; k < std::min(rope_idx + 10, graph->n_nodes); ++k) { - if (view_idx == -1 && - graph->nodes[k]->op == GGML_OP_VIEW && - graph->nodes[k]->src[0] == graph->nodes[rope_idx]) { - view_idx = k; - continue; - } - if (view_idx != -1 && - set_rows_idx == -1 && - graph->nodes[k]->op == GGML_OP_SET_ROWS && - graph->nodes[k]->src[0] == graph->nodes[view_idx]) { - set_rows_idx = k; - break; - } - } - if (set_rows_idx != -1) { - current_set.push_back(view_idx); - current_set.push_back(set_rows_idx); - used[view_idx] = true; - used[set_rows_idx] = true; - } - } - // Look for MUL_MAT_ID + ADD_ID + MUL - if (j > 0 && - graph->nodes[j]->op == GGML_OP_ADD_ID && - graph->nodes[j-1]->op == GGML_OP_MUL_MAT_ID) { - for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { - if (graph->nodes[k]->op == GGML_OP_MUL && - graph->nodes[k]->src[0] == graph->nodes[j] && - // src1 must either be weights or already processed - (graph->nodes[k]->src[1]->op == GGML_OP_NONE || used_node_set.find(graph->nodes[k]->src[1]) != used_node_set.end())) { - current_set.push_back(k); - used[k] = true; - break; - } - } - } - // Look for MUL_MAT + ADD + ADD - if (j > 0 && - graph->nodes[j]->op == GGML_OP_ADD && - graph->nodes[j-1]->op == GGML_OP_MUL_MAT) { - for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { - if (graph->nodes[k]->op == GGML_OP_ADD && - graph->nodes[k]->src[0] == graph->nodes[j] && - // src1 must either be weights or already processed - (graph->nodes[k]->src[1]->op == GGML_OP_NONE || used_node_set.find(graph->nodes[k]->src[1]) != used_node_set.end())) { - current_set.push_back(k); - used[k] = true; - break; - } - } - } - // SSM_CONV + ADD + UNARY: pull the consuming UNARY forward - if (j > 0 && - graph->nodes[j]->op == GGML_OP_ADD && - graph->nodes[j-1]->op == GGML_OP_SSM_CONV) { - for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { - if (graph->nodes[k]->op == GGML_OP_UNARY && - graph->nodes[k]->src[0] == graph->nodes[j]) { - current_set.push_back(k); - used[k] = true; - break; - } - } - } - } - } - // Second pass grabs view nodes. - // Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add). - if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) { - for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { - if (used[j]) { - continue; - } - if (!is_empty(graph->nodes[j])) { - continue; - } - bool ok = true; - for (int c = first_unused; c < j; ++c) { - bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end(); - // skip views whose srcs haven't been processed. - if (!used[c] && - is_src_of(graph->nodes[j], graph->nodes[c]) && - !c_in_current_set) { - ok = false; - break; - } - } - if (ok) { - current_set.push_back(j); - } - } - } - - // Push the current set into new_order - for (auto c : current_set) { - new_order.push_back(graph->nodes[c]); - used_node_set.insert(graph->nodes[c]); - used[c] = true; - } - while (first_unused < graph->n_nodes && used[first_unused]) { - first_unused++; - } - } - // Replace the graph with the new order. - for (int i = 0; i < graph->n_nodes; ++i) { - graph->nodes[i] = new_order[i]; - } -} - -static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_event_t event) { - VK_LOG_DEBUG("ggml_backend_vk_event_record(backend=" << backend << ", event=" << event << ")"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - vk_event *vkev = (vk_event *)event->context; - - ggml_vk_submit_transfer_ctx(ctx); - - vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset - - if (vkev->has_event) { - // Move existing event into submitted - vkev->events_submitted.push_back(vkev->event); - } - - // Grab the next event and record it, create one if necessary - if (vkev->events_free.empty()) { - vkev->event = ctx->device->device.createEvent({}); - } else { - vkev->event = vkev->events_free.back(); - vkev->events_free.pop_back(); - } - - vkev->has_event = true; - - ggml_vk_set_event(compute_ctx, vkev->event); - - vkev->tl_semaphore.value++; - compute_ctx->s->signal_semaphores.push_back(vkev->tl_semaphore); - ggml_vk_ctx_end(compute_ctx); - - ggml_vk_submit(compute_ctx, {}); - ctx->submit_pending = true; - vkev->cmd_buffer = cmd_buf; - vkev->cmd_buffer_use_counter = cmd_buf->use_counter; - ctx->compute_ctx.reset(); -} - -static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { - VK_LOG_DEBUG("ggml_backend_vk_event_wait(backend=" << backend << ", event=" << event << ")"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - vk_event *vkev = (vk_event *)event->context; - - vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - - if (vkev->has_event) { - // Wait for latest event - ggml_vk_wait_events(compute_ctx, { vkev->event }); - } -} - -// TODO: enable async and synchronize -static ggml_backend_i ggml_backend_vk_interface = { - /* .get_name = */ ggml_backend_vk_name, - /* .free = */ ggml_backend_vk_free, - /* .set_tensor_async = */ ggml_backend_vk_set_tensor_async, - /* .get_tensor_async = */ ggml_backend_vk_get_tensor_async, - /* .set_tensor_2d_async = */ ggml_backend_vk_set_tensor_2d_async, - /* .get_tensor_2d_async = */ ggml_backend_vk_get_tensor_2d_async, - /* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async, - /* .synchronize = */ ggml_backend_vk_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_vk_graph_compute, - /* .event_record = */ ggml_backend_vk_event_record, - /* .event_wait = */ ggml_backend_vk_event_wait, - /* .graph_optimize = */ ggml_vk_graph_optimize, -}; - -static ggml_guid_t ggml_backend_vk_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; -} - -ggml_backend_t ggml_backend_vk_init(size_t dev_num) { - VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); - - ggml_backend_vk_context * ctx = new ggml_backend_vk_context; - ggml_vk_init(ctx, dev_num); - - ggml_backend_t vk_backend = new ggml_backend { - /* .guid = */ ggml_backend_vk_guid(), - /* .iface = */ ggml_backend_vk_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), - /* .context = */ ctx, - }; - - if (!ctx->device->support_async) { - vk_backend->iface.get_tensor_async = nullptr; - } - - return vk_backend; -} - -bool ggml_backend_is_vk(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); -} - -int ggml_backend_vk_get_device_count() { - return ggml_vk_get_device_count(); -} - -void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { - GGML_ASSERT(device < (int) vk_instance.device_indices.size()); - int dev_idx = vk_instance.device_indices[device]; - ggml_vk_get_device_description(dev_idx, description, description_size); -} - -void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { - GGML_ASSERT(device < (int) vk_instance.device_indices.size()); - GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); - - vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; - vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops; - vk::PhysicalDeviceMemoryProperties2 memprops = {}; - const bool membudget_supported = vk_instance.device_supports_membudget[device]; - const bool is_integrated_gpu = vkdev.getProperties().deviceType == vk::PhysicalDeviceType::eIntegratedGpu; - - if (membudget_supported) { - memprops.pNext = &budgetprops; - } - vkdev.getMemoryProperties2(&memprops); - - *total = 0; - *free = 0; - - for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) { - const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i]; - - if (is_integrated_gpu || (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal)) { - *total += heap.size; - - if (membudget_supported && i < budgetprops.heapUsage.size()) { - *free += budgetprops.heapBudget[i] - budgetprops.heapUsage[i]; - } else { - *free += heap.size; - } - } - } -} - -static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) { - GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size()); - - vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]]; - - vk::PhysicalDeviceProperties2 props = {}; - device.getProperties2(&props); - - return props.properties.deviceType; -} - -static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { - GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size()); - - vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]]; - - const std::vector ext_props = device.enumerateDeviceExtensionProperties(); - - bool ext_support = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_EXT_pci_bus_info", properties.extensionName) == 0) { - ext_support = true; - break; - } - } - - if (!ext_support) { - return ""; - } - - vk::PhysicalDeviceProperties2 props = {}; - vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info = {}; - - props.pNext = &pci_bus_info; - - device.getProperties2(&props); - - const uint32_t pci_domain = pci_bus_info.pciDomain; - const uint32_t pci_bus = pci_bus_info.pciBus; - const uint32_t pci_device = pci_bus_info.pciDevice; - const uint8_t pci_function = (uint8_t) pci_bus_info.pciFunction; // pci function is between 0 and 7, prevent printf overflow warning - - char pci_bus_id[16] = {}; - snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); - - return std::string(pci_bus_id); -} - -////////////////////////// - -struct ggml_backend_vk_device_context { - size_t device; - std::string name; - std::string description; - bool is_integrated_gpu; - std::string pci_bus_id; - int op_offload_min_batch_size; -}; - -static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - return ctx->name.c_str(); -} - -static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - return ctx->description.c_str(); -} - -static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; - ggml_backend_vk_get_device_memory(ctx->device, free, total); -} - -static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - return ggml_backend_vk_buffer_type(ctx->device); -} - -static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return ggml_backend_vk_host_buffer_type(); -} - -static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - - return ctx->is_integrated_gpu ? GGML_BACKEND_DEVICE_TYPE_IGPU : GGML_BACKEND_DEVICE_TYPE_GPU; -} - -static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - - props->name = ggml_backend_vk_device_get_name(dev); - props->description = ggml_backend_vk_device_get_description(dev); - props->type = ggml_backend_vk_device_get_type(dev); - props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); - ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ true, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ true, - }; -} - -static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { - UNUSED(params); - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - return ggml_backend_vk_init(ctx->device); -} - -static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - const vk_device& device = ggml_vk_get_device(ctx->device); - - const bool uses_bda = (op->op == GGML_OP_IM2COL || op->op == GGML_OP_IM2COL_3D) && - device->shader_int64 && device->buffer_device_address; - - auto const & tensor_size_supported = [&](size_t tensor_size) { - if (tensor_size > device->max_buffer_size) { - return false; - } - // For im2col shaders using BDA, maxStorageBufferRange limit doesn't apply. - // If shader64BitIndexing is enabled, maxStorageBufferRange limit doesn't apply. - if (!uses_bda && !device->shader_64b_indexing) { - if (tensor_size > device->properties.limits.maxStorageBufferRange) { - return false; - } - } - return true; - }; - // reject any tensors larger than the max buffer size - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (op->src[i] && !tensor_size_supported(ggml_nbytes(op->src[i]))) { - return false; - } - } - if (!tensor_size_supported(ggml_nbytes(op))) { - return false; - } - - switch (op->op) { - case GGML_OP_UNARY: - switch (ggml_get_unary_op(op)) { - case GGML_UNARY_OP_EXP: - case GGML_UNARY_OP_EXPM1: - case GGML_UNARY_OP_ELU: - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_ERF: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_SILU: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_XIELU: - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_HARDSIGMOID: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_ABS: - case GGML_UNARY_OP_SOFTPLUS: - case GGML_UNARY_OP_STEP: - case GGML_UNARY_OP_ROUND: - case GGML_UNARY_OP_CEIL: - case GGML_UNARY_OP_FLOOR: - case GGML_UNARY_OP_TRUNC: - case GGML_UNARY_OP_SGN: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && - (op->src[0]->type == op->type); - default: - return false; - } - case GGML_OP_GLU: - switch (ggml_get_glu_op(op)) { - case GGML_GLU_OP_GEGLU: - case GGML_GLU_OP_REGLU: - case GGML_GLU_OP_SWIGLU: - case GGML_GLU_OP_SWIGLU_OAI: - case GGML_GLU_OP_GEGLU_ERF: - case GGML_GLU_OP_GEGLU_QUICK: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && - (op->src[0]->type == op->type) && - (!op->src[1] || op->src[1]->type == op->src[0]->type); - default: - return false; - } - case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: - { - ggml_type src0_type = op->src[0]->type; - if (op->op == GGML_OP_MUL_MAT_ID) { - if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) { - // If there's not enough shared memory for row_ids and the result tile, fallback to CPU - return false; - } - } - switch (src0_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - break; - default: - return false; - } - struct ggml_tensor * a; - struct ggml_tensor * b; - if (op->op == GGML_OP_MUL_MAT) { - a = op->src[0]; - b = op->src[1]; - } else { - a = op->src[2]; - b = op->src[1]; - } - if (a->ne[3] != b->ne[3]) { - return false; - } - if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_BF16) || - !(ggml_vk_dim01_contiguous(op->src[1]) || op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) { - return false; - } - if (op->src[0]->type == GGML_TYPE_BF16 && op->src[1]->type == GGML_TYPE_F16) { - // We currently don't have a bf16 x f16 shader, or an fp16->bf16 copy shader. - // So don't support this combination for now. - return false; - } - - return true; - } - case GGML_OP_FLASH_ATTN_EXT: - { - bool coopmat2 = device->coopmat2; - uint32_t HSK = op->src[1]->ne[0]; - uint32_t HSV = op->src[2]->ne[0]; - if ((HSK % 8) != 0 || (HSV % 8) != 0) { - return false; - } - if (op->src[4] && op->src[4]->type != GGML_TYPE_F32) { - return false; - } - if (op->src[0]->type != GGML_TYPE_F32) { - return false; - } - if (op->type != GGML_TYPE_F32) { - return false; - } - if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { - return false; - } - auto fa_kv_ok = [coopmat2](ggml_type t) { - switch (t) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_0: - return true; - case GGML_TYPE_Q1_0: - return coopmat2; - default: - return false; - } - }; - if (!fa_kv_ok(op->src[1]->type) || !fa_kv_ok(op->src[2]->type)) { - return false; - } - if ((op->src[1]->type == GGML_TYPE_BF16) != (op->src[2]->type == GGML_TYPE_BF16)) { - return false; - } - if (!coopmat2 && !(device->subgroup_shuffle && device->subgroup_vote)) { - // scalar/coopmat1 FA uses subgroupShuffle/subgroupAll - return false; - } - return true; - } - case GGML_OP_GET_ROWS: - { - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - case GGML_TYPE_NVFP4: - case GGML_TYPE_I32: - return true; - default: - return false; - } - } - case GGML_OP_SET_ROWS: - { - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_IQ4_NL: - return true; - default: - return false; - } - } - case GGML_OP_CONT: - case GGML_OP_CPY: - case GGML_OP_DUP: - { - ggml_type src0_type = op->src[0]->type; - ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type; - - if (src0_type == GGML_TYPE_F32) { - switch (src1_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_IQ4_NL: - return true; - default: - break; - } - } - if (src1_type == GGML_TYPE_F32) { - switch (src0_type) { - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q1_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_IQ4_NL: - return true; - default: - break; - } - } - - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return true; - } - - if ( - (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) || - (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) - ) { - return true; - } - - // We can handle copying from a type to the same type if it's - // either not quantized or is quantized and contiguous. - // We use f16 or f32 shaders to do the copy, - // so the type/block size must be a multiple of 4. - if (src0_type == src1_type && - (!ggml_is_quantized(src0_type) || (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op))) && - (ggml_type_size(src0_type) % 2) == 0) { - return true; - } - return false; - } - case GGML_OP_REPEAT: - return ggml_type_size(op->type) == ggml_type_size(op->src[0]->type) && - (ggml_type_size(op->type) == sizeof(float) || ggml_type_size(op->type) == 2); - case GGML_OP_REPEAT_BACK: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_ROPE: - return ggml_is_contiguous_rows(op) && ggml_is_contiguous_rows(op->src[0]); - case GGML_OP_ROPE_BACK: - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_RMS_NORM: - return true; - case GGML_OP_NORM: - case GGML_OP_GROUP_NORM: - return ggml_is_contiguous(op->src[0]); - case GGML_OP_L2_NORM: - return ggml_is_contiguous_rows(op->src[0]) && - op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) && - (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); - case GGML_OP_ADD_ID: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->src[2]->type == GGML_TYPE_I32 && - op->type == GGML_TYPE_F32; - case GGML_OP_SILU_BACK: - case GGML_OP_RMS_NORM_BACK: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_SIN: - case GGML_OP_COS: - case GGML_OP_CLAMP: - return op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_LEAKY_RELU: - case GGML_OP_OPT_STEP_ADAMW: - case GGML_OP_OPT_STEP_SGD: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_LOG: - case GGML_OP_TRI: - case GGML_OP_DIAG: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - op->type == op->src[0]->type; - case GGML_OP_ARGSORT: - { - if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) { - return false; - } - // pipeline_argsort_large_f32 requires vulkan memory model. - if (device->vulkan_memory_model) { - return true; - } else { - return op->ne[0] <= (1 << device->max_workgroup_size_log2); - } - } - case GGML_OP_TOP_K: - { - if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) { - return false; - } - // We could potentially support larger, using argsort to sort the - // whole thing. Not clear if this is needed. - uint32_t min_pipeline = (uint32_t)log2f(float(op->ne[0])) + 1; - if (min_pipeline >= num_topk_pipelines || - !device->pipeline_topk_f32[min_pipeline]) { - return false; - } - } - return true; - case GGML_OP_UPSCALE: - if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) { - if ((op->op_params[0] & 0xFF) != GGML_SCALE_MODE_BILINEAR) { - return false; - } - } - return op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_ACC: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; - case GGML_OP_SET: - return op->src[0]->type == op->src[1]->type && op->src[0]->type == op->type && - (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_I32); - case GGML_OP_CONCAT: { - if (op->src[0]->type != op->src[1]->type || op->src[0]->type != op->type) { - return false; - } - const size_t type_size = ggml_type_size(op->type); - return ggml_blck_size(op->type) == 1 && - (type_size == 1 || type_size == 2 || type_size == 4 || type_size == 8); - } - case GGML_OP_ADD1: - return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32) - || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32) - || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16); - case GGML_OP_ARANGE: - return op->type == GGML_TYPE_F32; - case GGML_OP_FILL: - return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16; - case GGML_OP_SCALE: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_PAD: - case GGML_OP_ROLL: - return op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_DIAG_MASK_INF: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_SOFT_MAX: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32 - && (!op->src[1] || (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)); - case GGML_OP_SOFT_MAX_BACK: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32 - && ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32; - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]); - case GGML_OP_CUMSUM: - { - if (device->subgroup_arithmetic && device->subgroup_require_full_support) { - return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]); - } - return false; - } - case GGML_OP_SOLVE_TRI: - { - if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32) { - return false; - } - const uint32_t N = op->src[0]->ne[0]; - const uint32_t K = op->src[1]->ne[0]; - // K dimension limited to workgroup size - if (K > 1u << device->max_workgroup_size_log2) { - return false; - } - const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((N + K) * sizeof(float)); - - if (batch_N == 0) { - return false; - } - return true; - } - case GGML_OP_ARGMAX: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_COUNT_EQUAL: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_I32 - && ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_I32; - case GGML_OP_IM2COL: - return ggml_is_contiguous(op->src[1]) - && op->src[1]->type == GGML_TYPE_F32 - && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); - case GGML_OP_IM2COL_3D: - return op->src[1]->type == GGML_TYPE_F32 - && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); - case GGML_OP_TIMESTEP_EMBEDDING: - return op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_CONV_2D_DW: - return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) - && op->src[1]->type == GGML_TYPE_F32; - case GGML_OP_POOL_2D: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_RWKV_WKV6: - case GGML_OP_RWKV_WKV7: - return true; // all inputs are contiguous, see ggml.c - case GGML_OP_GATED_DELTA_NET: - { - const uint32_t S_v = op->src[2]->ne[0]; - if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) { - return false; - } - for (int i = 0; i < 6; i++) { - if (op->src[i] == nullptr || op->src[i]->type != GGML_TYPE_F32) { - return false; - } - } - return op->type == GGML_TYPE_F32; - } - case GGML_OP_SSM_SCAN: - { - for (int i = 0; i < 6; i++) { - if (op->src[i] && ggml_is_quantized(op->src[i]->type)) { - return false; - } - } - if (op->src[6] && op->src[6]->type != GGML_TYPE_I32) { - return false; - } - if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) { - return false; - } - - const uint32_t d_state = op->src[0]->ne[0]; - const uint32_t head_dim = op->src[0]->ne[1]; - - bool is_mamba2 = (op->src[3] && op->src[3]->nb[1] == sizeof(float)); - if (!is_mamba2) { - return false; - } - - if ((d_state != 128 && d_state != 256) || head_dim % 16 != 0) { - return false; - } - - size_t shmem_size = d_state * sizeof(float); - - if (shmem_size > device->properties.limits.maxComputeSharedMemorySize) { - return false; - } - - if (!device->subgroup_basic) { - return false; - } - - return true; - } - case GGML_OP_SSM_CONV: - return op->src[0]->type == GGML_TYPE_F32; - case GGML_OP_CONV_TRANSPOSE_1D: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; - case GGML_OP_COL2IM_1D: - return (op->src[0]->type == GGML_TYPE_F32 || - op->src[0]->type == GGML_TYPE_F16 || - op->src[0]->type == GGML_TYPE_BF16) && - op->type == op->src[0]->type && - ggml_is_contiguous(op->src[0]) && - ggml_is_contiguous(op); - case GGML_OP_CONV_2D: - case GGML_OP_CONV_TRANSPOSE_2D: - { - // Channel-contiguous format is not supported yet. - return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && - op->src[1]->type == GGML_TYPE_F32 && - op->type == GGML_TYPE_F32 && - ggml_is_contiguous(op->src[0]) && - ggml_is_contiguous(op->src[1]) && - ggml_is_contiguous(op)); - } - default: - return false; - } - - UNUSED(dev); -} - -static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { - return false; - } - - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - - return buft_ctx->device->idx == ctx->device; -} - -static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { - switch (op->op) { - case GGML_OP_GET_ROWS: - return 0; - case GGML_OP_MUL_MAT: - return op->ne[1]; - case GGML_OP_MUL_MAT_ID: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - return op->ne[2]; - default: - return ggml_nrows(op); - } -} - -static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context; - - return ggml_vk_get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size; -} - -static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - auto device = ggml_vk_get_device(ctx->device); - - vk_event *vkev = new vk_event; - if (!vkev) { - return nullptr; - } - - // No events initially, they get created on demand - vkev->has_event = false; - - vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; - vk::SemaphoreCreateInfo ci{}; - ci.setPNext(&tci); - vkev->tl_semaphore = { device->device.createSemaphore(ci), 0 }; - - return new ggml_backend_event { - /* .device = */ dev, - /* .context = */ vkev, - }; -} - -static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - auto device = ggml_vk_get_device(ctx->device); - - vk_event *vkev = (vk_event *)event->context; - - device->device.destroySemaphore(vkev->tl_semaphore.s); - for (auto& event : vkev->events_free) { - device->device.destroyEvent(event); - } - for (auto& event : vkev->events_submitted) { - device->device.destroyEvent(event); - } - if (vkev->has_event) { - device->device.destroyEvent(vkev->event); - } - delete vkev; - delete event; -} - -static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { - VK_LOG_DEBUG("ggml_backend_vk_device_event_synchronize(backend=" << dev << ", event=" << event << ")"); - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - auto device = ggml_vk_get_device(ctx->device); - vk_event *vkev = (vk_event *)event->context; - - // Only do something if the event has actually been used - if (vkev->has_event) { - vk::Semaphore sem = vkev->tl_semaphore.s; - uint64_t val = vkev->tl_semaphore.value; - vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, sem, val}; - VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize"); - - // Reset and move submitted events - for (auto& event : vkev->events_submitted) { - device->device.resetEvent(event); - } - vkev->events_free.insert(vkev->events_free.end(), vkev->events_submitted.begin(), vkev->events_submitted.end()); - vkev->events_submitted.clear(); - - // Finished using current command buffer so we flag for reuse - if (vkev->cmd_buffer) { - // Only flag for reuse if it hasn't been reused already - if (vkev->cmd_buffer_use_counter == vkev->cmd_buffer->use_counter) { - vkev->cmd_buffer->in_use = false; - vkev->cmd_buffer->buf.reset(); - } - vkev->cmd_buffer = nullptr; - } - } -} - -static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) { - if (!device->external_memory_host) { - return {}; - } - - uintptr_t uptr = reinterpret_cast(ptr); - if (uptr & (device->min_imported_host_pointer_alignment - 1)) { - return {}; - } - if (size & (device->min_imported_host_pointer_alignment - 1)) { - return {}; - } - - const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached; - - vk_buffer buf {}; - try { - buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr); - } catch (vk::SystemError& e) { - GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what()); - } - - return buf; -} - -static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); - GGML_UNUSED(max_tensor_size); - - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - auto device = ggml_vk_get_device(ctx->device); - - vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size); - - if (!buf) { - return {}; - } - - ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name); - - ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size); - - return ret; -} - -static const struct ggml_backend_device_i ggml_backend_vk_device_i = { - /* .get_name = */ ggml_backend_vk_device_get_name, - /* .get_description = */ ggml_backend_vk_device_get_description, - /* .get_memory = */ ggml_backend_vk_device_get_memory, - /* .get_type = */ ggml_backend_vk_device_get_type, - /* .get_props = */ ggml_backend_vk_device_get_props, - /* .init_backend = */ ggml_backend_vk_device_init, - /* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ ggml_backend_vk_device_buffer_from_host_ptr, - /* .supports_op = */ ggml_backend_vk_device_supports_op, - /* .supports_buft = */ ggml_backend_vk_device_supports_buft, - /* .offload_op = */ ggml_backend_vk_device_offload_op, - /* .event_new = */ ggml_backend_vk_device_event_new, - /* .event_free = */ ggml_backend_vk_device_event_free, - /* .event_synchronize = */ ggml_backend_vk_device_event_synchronize, -}; - -static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_VK_NAME; -} - -static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return ggml_backend_vk_get_device_count(); -} - -static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - - static bool initialized = false; - - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32; - for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { - ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; - char desc[256]; - ggml_backend_vk_get_device_description(i, desc, sizeof(desc)); - ctx->device = i; - ctx->name = GGML_VK_NAME + std::to_string(i); - ctx->description = desc; - ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; - ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); - ctx->op_offload_min_batch_size = min_batch_size; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_vk_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } - } - - GGML_ASSERT(device < devices.size()); - return devices[device]; -} - -static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { - /* .get_name = */ ggml_backend_vk_reg_get_name, - /* .get_device_count = */ ggml_backend_vk_reg_get_device_count, - /* .get_device = */ ggml_backend_vk_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_vk_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_vk_reg_i, - /* .context = */ nullptr, - }; - try { - ggml_vk_instance_init(); - return ® - } catch (const vk::SystemError& e) { - VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: System error: " << e.what()); - return nullptr; - } catch (const std::exception &e) { - VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: " << e.what()); - return nullptr; - } catch (...) { - VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: unknown exception during Vulkan init"); - return nullptr; - } -} - -// Extension availability -static bool ggml_vk_instance_layer_settings_available() { -#ifdef GGML_VULKAN_VALIDATE - // Check if validation layer provides the extension - const std::string layer_name = "VK_LAYER_KHRONOS_validation"; - for (const auto& layer : vk::enumerateInstanceLayerProperties()) { - if (layer_name == layer.layerName.data()) { - for (const auto& ext : vk::enumerateInstanceExtensionProperties(layer_name)) { - if (strcmp("VK_EXT_layer_settings", ext.extensionName.data()) == 0) { - return true; - } - } - } - } - - std::cerr << "ggml_vulkan: WARNING: Validation layer or layer extension VK_EXT_layer_settings not found." << std::endl; -#endif - return false; -} -static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions) { -#ifdef __APPLE__ - // Check for portability enumeration extension for MoltenVK support - for (const auto& properties : instance_extensions) { - if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { - return true; - } - } - std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; -#endif - return false; - - UNUSED(instance_extensions); -} - -// Extension availability -static bool ggml_vk_instance_debug_utils_ext_available( - const std::vector & instance_extensions) { - // Check for portability enumeration extension for MoltenVK support - for (const auto & properties : instance_extensions) { - if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) { - return true; - } - } - - std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl; - return false; - - UNUSED(instance_extensions); -} - -static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) { - VkPhysicalDeviceFeatures2 device_features2; - device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; - - VkPhysicalDeviceVulkan11Features vk11_features; - vk11_features.pNext = nullptr; - vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; - device_features2.pNext = &vk11_features; - - vkGetPhysicalDeviceFeatures2(vkdev, &device_features2); - - return vk11_features.storageBuffer16BitAccess; -} - -static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { - switch (props.vendorID) { - case VK_VENDOR_ID_INTEL: - // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, - // while some older hardware (ex. Arc A770) has performance regressions - return arch == vk_device_architecture::INTEL_XE2; - case VK_VENDOR_ID_AMD: - if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { - // Workaround for AMD proprietary driver reporting support on all GPUs - return arch == vk_device_architecture::AMD_RDNA3; - } - return true; - default: - return true; - } -} - -static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) { - VkPhysicalDeviceProperties2 props = vkdev.getProperties2(); - - if (props.properties.vendorID != VK_VENDOR_ID_INTEL) { - return 0; - } - - const uint32_t device_id = props.properties.deviceID; - - switch (device_id) { - case 0x56A6: // A310 - return 6; - case 0x5693: // A370M - case 0x56A5: // A380 - case 0x56B1: // Pro A40/A50 - return 8; - case 0x5697: // A530M - return 12; - case 0x5692: // A550M - case 0x56B3: // Pro A60 - return 16; - case 0x56A2: // A580 - return 24; - case 0x5691: // A730M - case 0x56A1: // A750 - return 28; - case 0x56A0: // A770 - case 0x5690: // A770M - return 32; - case 0xE212: // Pro B50 - return 16; - case 0xE20C: // B570 - return 18; - case 0xE20B: // B580 - case 0xE211: // Pro B60 - return 20; - default: - return 0; - } -} - -// checks - -#ifdef GGML_VULKAN_CHECK_RESULTS -static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector& done, int level = 0) { - if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) { - return; - } - for (int j = 0; j < level; j++) { - std::cerr << " "; - } - std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl; - - done.push_back(tensor); - - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (tensor->src[i] != nullptr) { - ggml_vk_print_graph_origin(tensor->src[i], done, level + 1); - } - } -} - -static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) { - if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) { - return; - } - i0 = std::max(i0, 5); - i1 = std::max(i1, 5); - i2 = std::max(i2, 0); - i3 = std::max(i3, 0); - fprintf(stderr, " "); - for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { - fprintf(stderr, "%7d ", idx1); - } - fprintf(stderr, "\n"); - for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { - fprintf(stderr, "%7d: ", idx0); - for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { - if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { - float val; - if (tensor->type == GGML_TYPE_F32) { - val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); - } else if (tensor->type == GGML_TYPE_F16) { - val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); - } else if (tensor->type == GGML_TYPE_I32) { - val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); - } else { - GGML_ABORT("fatal error"); - } - fprintf(stderr, "% 7.2f ", val); - } else { - fprintf(stderr, " "); - } - } - fprintf(stderr, "\n"); - } -} - -static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { - void * tensor_data = tensor->data; - - const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer); - - if (is_gpu) { - const size_t tensor_size = ggml_nbytes(tensor); - tensor_data = malloc(tensor_size); - - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - - vk_buffer buffer_gpu = buf_ctx->dev_buffer; - ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size); - } - - std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; - std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl; - if (tensor->src[0] != nullptr) { - std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl; - } - if (tensor->src[1] != nullptr) { - std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl; - } - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(tensor, done); - - if (is_gpu) { - free(tensor_data); - } -} - -void * comp_result; -size_t comp_size; -size_t comp_nb[GGML_MAX_DIMS]; -size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { - ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops]; - if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) { - return; - } - - check_counter++; - if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { - return; - } - - VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")"); - - struct ggml_init_params iparams = { - /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ggml_ctx = ggml_init(iparams); - - std::array src_clone = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; - const char * srci_name[GGML_MAX_SRC] = {"src0", "src1", "src2", "src3", "src4", "src5", "src6", "src7", "src8", "src9"}; - - std::map cloned_tensors; - std::vector cloned_mallocs; - - struct ggml_tensor * tensor_clone = nullptr; - - for (int f = 0; f < ctx->num_additional_fused_ops + 1; ++f) { - tensor = cgraph->nodes[tensor_idx + f]; - for (int i = 0; i < GGML_MAX_SRC; i++) { - ggml_tensor * srci = tensor->src[i]; - if (srci == nullptr) { - continue; - } - // If a src tensor has been cloned, use that one - auto it = cloned_tensors.find(srci); - if (it != cloned_tensors.end()) { - src_clone[i] = it->second; - continue; - } - ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci); - size_t srci_size = ggml_nbytes(srci); - - src_clone[i] = srci_clone; - void *src_buffer = malloc(srci_size); - cloned_mallocs.push_back(src_buffer); - - srci_clone->data = src_buffer; - if (ggml_backend_buffer_is_host(srci->buffer)) { - memcpy(srci_clone->data, srci->data, srci_size); - memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (ggml_backend_buffer_is_vk(srci->buffer)) { - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context; - vk_buffer& buffer_gpu = buf_ctx->dev_buffer; - uint64_t offset = vk_tensor_offset(srci) + srci->view_offs; - if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) { - for (int i3 = 0; i3 < srci->ne[3]; i3++) { - for (int i2 = 0; i2 < srci->ne[2]; i2++) { - const int idx = i3*srci->ne[2] + i2; - ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]); - } - } - - srci_clone->nb[0] = srci->nb[0]; - srci_clone->nb[1] = srci->nb[1]; - for (int i = 2; i < GGML_MAX_DIMS; i++) { - srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1]; - } - } else { - if (offset + srci_size >= buffer_gpu->size) { - srci_size = buffer_gpu->size - offset; - } - ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size); - memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); - } - } else { - GGML_ABORT("fatal error"); - } - - if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(srci, srci_name[i]); - } - } - - if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); - if (src_clone[4]) { - ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); - } - } else if (tensor->op == GGML_OP_MUL_MAT) { - tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_MUL_MAT_ID) { - tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); - } else if (tensor->op == GGML_OP_SUB) { - tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_MUL) { - tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_DIV) { - tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_CONCAT) { - tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params); - } else if (tensor->op == GGML_OP_UPSCALE) { - tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]); - } else if (tensor->op == GGML_OP_SCALE) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); - } else if (tensor->op == GGML_OP_ADD1) { - tensor_clone = ggml_add1(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_ARANGE) { - const float start = ggml_get_op_params_f32(tensor, 0); - const float stop = ggml_get_op_params_f32(tensor, 1); - const float step = ggml_get_op_params_f32(tensor, 2); - tensor_clone = ggml_arange(ggml_ctx, start, stop, step); - } else if (tensor->op == GGML_OP_FILL) { - const float value = ggml_get_op_params_f32(tensor, 0); - tensor_clone = ggml_fill(ggml_ctx, src_clone[0], value); - } else if (tensor->op == GGML_OP_SQR) { - tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SQRT) { - tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SIN) { - tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_COS) { - tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_LOG) { - tensor_clone = ggml_log(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_TRI) { - tensor_clone = ggml_tri(ggml_ctx, src_clone[0], (ggml_tri_type)ggml_get_op_params_i32(tensor, 0)); - } else if (tensor->op == GGML_OP_DIAG) { - tensor_clone = ggml_diag(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_CLAMP) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); - } else if (tensor->op == GGML_OP_PAD) { - tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], - tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); - } else if (tensor->op == GGML_OP_REPEAT) { - tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); - } else if (tensor->op == GGML_OP_REPEAT_BACK) { - tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor); - } else if (tensor->op == GGML_OP_ADD) { - tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_ACC) { - tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); - } else if (tensor->op == GGML_OP_SET) { - tensor_clone = ggml_set(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); - } else if (tensor->op == GGML_OP_NORM) { - tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); - } else if (tensor->op == GGML_OP_GROUP_NORM) { - const float * float_params = (const float *)tensor->op_params; - tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]); - } else if (tensor->op == GGML_OP_RMS_NORM) { - tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); - } else if (tensor->op == GGML_OP_RMS_NORM_BACK) { - const float eps = ((float *) tensor->op_params)[0]; - tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps); - } else if (tensor->op == GGML_OP_SILU_BACK) { - tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_L2_NORM) { - const float eps = ((float *) tensor->op_params)[0]; - tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps); - } else if (tensor->op == GGML_OP_SOFT_MAX) { - if (tensor->src[1] != nullptr) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]); - } else { - tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]); - } - } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) { - tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); - } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { - tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]); - } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) { - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; - const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; - const float freq_base = ((float *) tensor->op_params)[5]; - const float freq_scale = ((float *) tensor->op_params)[6]; - const float ext_factor = ((float *) tensor->op_params)[7]; - const float attn_factor = ((float *) tensor->op_params)[8]; - const float beta_fast = ((float *) tensor->op_params)[9]; - const float beta_slow = ((float *) tensor->op_params)[10]; - if (mode & GGML_ROPE_TYPE_MROPE) { - int32_t *sections = ((int32_t *) tensor->op_params) + 11; - if (tensor->op == GGML_OP_ROPE) { - tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - } else { - tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - } - } else { - if (tensor->op == GGML_OP_ROPE) { - tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - } else { - tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - } - } - } else if (tensor->op == GGML_OP_UNARY) { - switch (ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_EXP: - tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_EXPM1: - tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_ELU: - tensor_clone = ggml_elu(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_SILU: - tensor_clone = ggml_silu(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_GELU: - tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_GELU_ERF: - tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_GELU_QUICK: - tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_RELU: - tensor_clone = ggml_relu(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_XIELU: - tensor_clone = ggml_xielu(ggml_ctx, src_clone[0], 0, 0, 0, 0); - ggml_set_op_params_f32(tensor_clone, 1, ggml_get_op_params_f32(tensor, 1)); - ggml_set_op_params_f32(tensor_clone, 2, ggml_get_op_params_f32(tensor, 2)); - ggml_set_op_params_f32(tensor_clone, 3, ggml_get_op_params_f32(tensor, 3)); - ggml_set_op_params_f32(tensor_clone, 4, ggml_get_op_params_f32(tensor, 4)); - break; - case GGML_UNARY_OP_NEG: - tensor_clone = ggml_neg(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_TANH: - tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_SIGMOID: - tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_HARDSIGMOID: - tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_HARDSWISH: - tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_ABS: - tensor_clone = ggml_abs(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_SOFTPLUS: - tensor_clone = ggml_softplus(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_STEP: - tensor_clone = ggml_step(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_ROUND: - tensor_clone = ggml_round(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_CEIL: - tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_FLOOR: - tensor_clone = ggml_floor(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_TRUNC: - tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_SGN: - tensor_clone = ggml_sgn(ggml_ctx, src_clone[0]); - break; - default: - std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ABORT("fatal error"); - } - } else if (tensor->op == GGML_OP_GLU) { - if (src_clone[1] == nullptr) { - tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]); - } else { - tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); - } - ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2)); - ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3)); - } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { - if (tensor->src[1] == nullptr) { - tensor_clone = ggml_dup(ggml_ctx, src_clone[0]); - tensor_clone->type = tensor->type; - } else { - tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]); - } - } else if (tensor->op == GGML_OP_CONT) { - tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); - } else if (tensor->op == GGML_OP_RESHAPE) { - tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); - } else if (tensor->op == GGML_OP_VIEW) { - tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); - } else if (tensor->op == GGML_OP_PERMUTE) { - int32_t * params = (int32_t *)tensor->op_params; - tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]); - } else if (tensor->op == GGML_OP_TRANSPOSE) { - tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_GET_ROWS) { - tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_ARGSORT) { - tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params); - } else if (tensor->op == GGML_OP_TOP_K) { - tensor_clone = ggml_top_k(ggml_ctx, src_clone[0], tensor->ne[0]); - } else if (tensor->op == GGML_OP_SUM) { - tensor_clone = ggml_sum(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SUM_ROWS) { - tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_CUMSUM) { - tensor_clone = ggml_cumsum(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_MEAN) { - tensor_clone = ggml_mean(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_ARGMAX) { - tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_COUNT_EQUAL) { - tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_SOLVE_TRI) { - tensor_clone = ggml_solve_tri(ggml_ctx, src_clone[0], src_clone[1], true, true, false); - } else if (tensor->op == GGML_OP_IM2COL) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t p0 = tensor->op_params[2]; - const int32_t p1 = tensor->op_params[3]; - const int32_t d0 = tensor->op_params[4]; - const int32_t d1 = tensor->op_params[5]; - - const bool is_2D = tensor->op_params[6] == 1; - tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type); - } else if (tensor->op == GGML_OP_IM2COL_3D) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t s2 = tensor->op_params[2]; - const int32_t p0 = tensor->op_params[3]; - const int32_t p1 = tensor->op_params[4]; - const int32_t p2 = tensor->op_params[5]; - const int32_t d0 = tensor->op_params[6]; - const int32_t d1 = tensor->op_params[7]; - const int32_t d2 = tensor->op_params[8]; - const int32_t IC = tensor->op_params[9]; - - tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); - } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { - const int32_t dim = tensor->op_params[0]; - const int32_t max_period = tensor->op_params[1]; - tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period); - } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){ - const int32_t s0 = tensor->op_params[0]; - const int32_t p0 = tensor->op_params[1]; - const int32_t d0 = tensor->op_params[2]; - tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); - } else if (tensor->op == GGML_OP_COL2IM_1D) { - const int32_t stride = tensor->op_params[0]; - const int32_t oc = tensor->op_params[1]; - const int32_t p0 = tensor->op_params[2]; - tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0); - } else if (tensor->op == GGML_OP_POOL_2D) { - enum ggml_op_pool op = static_cast(tensor->op_params[0]); - const int32_t k0 = tensor->op_params[1]; - const int32_t k1 = tensor->op_params[2]; - const int32_t s0 = tensor->op_params[3]; - const int32_t s1 = tensor->op_params[4]; - const int32_t p0 = tensor->op_params[5]; - const int32_t p1 = tensor->op_params[6]; - - tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1); - } else if (tensor->op == GGML_OP_CONV_2D) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t p0 = tensor->op_params[2]; - const int32_t p1 = tensor->op_params[3]; - const int32_t d0 = tensor->op_params[4]; - const int32_t d1 = tensor->op_params[5]; - tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); - } else if (tensor->op == GGML_OP_CONV_2D_DW) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t p0 = tensor->op_params[2]; - const int32_t p1 = tensor->op_params[3]; - const int32_t d0 = tensor->op_params[4]; - const int32_t d1 = tensor->op_params[5]; - tensor_clone = ggml_conv_2d_dw_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); - } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) { - const int32_t s = tensor->op_params[0]; - tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s); - } else if (tensor->op == GGML_OP_LEAKY_RELU) { - const float * op_params = (const float *)tensor->op_params; - tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false); - } else if (tensor->op == GGML_OP_RWKV_WKV6) { - tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2], src_clone[3], src_clone[4], src_clone[5]); - } else if (tensor->op == GGML_OP_RWKV_WKV7) { - tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], - src_clone[4], src_clone[5], src_clone[6]); - } else if (tensor->op == GGML_OP_GATED_DELTA_NET) { - tensor_clone = ggml_gated_delta_net(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2], src_clone[3], src_clone[4], src_clone[5], - ggml_get_op_params_i32(tensor, 0)); - } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) { - src_clone[0]->flags = tensor->src[0]->flags; - tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2], src_clone[3], src_clone[4]); - } else if (tensor->op == GGML_OP_OPT_STEP_SGD) { - src_clone[0]->flags = tensor->src[0]->flags; - tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2]); - } else if (tensor->op == GGML_OP_ADD_ID) { - tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); - } else if (tensor->op == GGML_OP_SSM_SCAN) { - tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], - src_clone[3], src_clone[4], src_clone[5], src_clone[6]); - } else if (tensor->op == GGML_OP_SSM_CONV) { - tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_ROLL) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t s2 = tensor->op_params[2]; - const int32_t s3 = tensor->op_params[3]; - tensor_clone = ggml_roll(ggml_ctx, src_clone[0], s0, s1, s2, s3); - } - else { - std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ABORT("fatal error"); - } - cloned_tensors[tensor] = tensor_clone; - } - - ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx); - ggml_build_forward_expand(cgraph_cpu, tensor_clone); - - ggml_graph_compute_with_ctx(ggml_ctx, cgraph_cpu, 8); - - if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(tensor_clone, "tensor_clone"); - } - - comp_size = ggml_nbytes(tensor_clone); - - comp_result = malloc(comp_size); - memcpy(comp_result, tensor_clone->data, comp_size); - memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS); - - for (auto m : cloned_mallocs) { - free(m); - } - - ggml_free(ggml_ctx); - - VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")"); -} - -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { - ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops]; - if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) { - return; - } - - if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { - return; - } - - VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")"); - - ggml_tensor * src0 = tensor->src[0]; - ggml_tensor * src1 = tensor->src[1]; - ggml_tensor * src2 = tensor->src[2]; - ggml_tensor * src3 = tensor->src[3]; - - void * tensor_data = tensor->data; - - if (ggml_backend_buffer_is_vk(tensor->buffer)) { - size_t tensor_size = ggml_nbytes(tensor); - tensor_data = malloc(tensor_size); - - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - - vk_buffer& buffer_gpu = buf_ctx->dev_buffer; - uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs; - if (offset + tensor_size >= buffer_gpu->size) { - tensor_size = buffer_gpu->size - offset; - } - - ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size); - } - - float first_error_result = -1.0f; - float first_error_correct = -1.0f; - std::array first_error = { -1, -1, -1, -1 }; - double avg_err = 0.0; - size_t counter = 0; - - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - const bool buffer_size_fit = i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0] < comp_size; - float correct = 0.0f; - float result = 0.0f; - - if (buffer_size_fit) { - if (tensor->type == GGML_TYPE_F32) { - correct = *(float *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]); - result = *(float *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); - } else if (tensor->type == GGML_TYPE_F16) { - correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0])); - result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0])); - } else if (tensor->type == GGML_TYPE_BF16) { - correct = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0])); - result = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0])); - } else if (tensor->type == GGML_TYPE_I32) { - correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]); - result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); - } else if (tensor->type == GGML_TYPE_I64) { - correct = *(int64_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]); - result = *(int64_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); - } else { - std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl; - } - } else { - std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl; - GGML_ABORT("fatal error"); - } - - if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { - std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl; - std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; - if (src0 != nullptr) { - std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; - } - if (src1 != nullptr) { - std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; - } - if (src2 != nullptr) { - std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; - } - if (src3 != nullptr) { - std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; - } - std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3); - std::cerr << std::endl << "Correct:" << std::endl; - ggml_vk_print_tensor_area(tensor, comp_result, i0, i1, i2, i3); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(tensor, done); - GGML_ABORT("fatal error"); - } - const double denom = std::fabs(correct) > 1.0f ? (std::fabs(correct) > 1e-8 ? std::fabs(correct) : 1e-8) : 1.0f; - if (first_error[0] == -1 && std::fabs(correct - result) / denom > 0.5) { - first_error[0] = i0; - first_error[1] = i1; - first_error[2] = i2; - first_error[3] = i3; - first_error_result = result; - first_error_correct = correct; - } - - // Special case, value is infinite, avoid NaN result in avg_err - // NaN also appears in results, if both are nan error is 0 - if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) { - avg_err += std::fabs(correct - result) / denom; - } - counter++; - } - } - } - } - - avg_err /= counter; - - if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; - if (src0 != nullptr) { - std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; - } - if (src1 != nullptr) { - std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; - } - if (src2 != nullptr) { - std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; - } - if (src3 != nullptr) { - std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; - } - std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); - std::cerr << std::endl << "Correct:" << std::endl; - ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(tensor, done); - } - - if (avg_err > 0.01 || std::isnan(avg_err)) { - std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; - if (src0 != nullptr) { - std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; - } - if (src1 != nullptr) { - std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; - } - if (src2 != nullptr) { - std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; - } - if (src3 != nullptr) { - std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; - } - std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]); - std::cerr << std::endl << "Correct:" << std::endl; - ggml_vk_print_tensor_area(tensor, comp_result, first_error[0], first_error[1], first_error[2], first_error[3]); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(tensor, done); - GGML_ABORT("fatal error"); - } else { - std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; - } - - free(comp_result); - comp_result = nullptr; - comp_size = 0; - - if (ggml_backend_buffer_is_vk(tensor->buffer)) { - free(tensor_data); - } - - VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); -} -#endif - -GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)