mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
vulkan: split ggml-vulkan.cpp file
This commit is contained in:
parent
f8cc15f163
commit
037397792a
@ -60,7 +60,19 @@ if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
ggml_add_backend_library(ggml-vulkan
|
||||
ggml-vulkan.cpp
|
||||
ggml-vulkan-instance.cpp
|
||||
ggml-vulkan-shaders.cpp
|
||||
ggml-vulkan-buffers.cpp
|
||||
ggml-vulkan-pipelines.cpp
|
||||
ggml-vulkan-matmul.cpp
|
||||
ggml-vulkan-flash-attn.cpp
|
||||
ggml-vulkan-operators.cpp
|
||||
ggml-vulkan-graph.cpp
|
||||
ggml-vulkan-backend.cpp
|
||||
ggml-vulkan-debug.cpp
|
||||
ggml-vulkan-types.h
|
||||
ggml-vulkan-push-constants.h
|
||||
ggml-vulkan-common.h
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
|
||||
|
||||
1867
ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp
Normal file
1867
ggml/src/ggml-vulkan/ggml-vulkan-backend.cpp
Normal file
File diff suppressed because it is too large
Load Diff
783
ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp
Normal file
783
ggml/src/ggml-vulkan/ggml-vulkan-buffers.cpp
Normal file
@ -0,0 +1,783 @@
|
||||
#include "ggml-vulkan-common.h"
|
||||
|
||||
ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
static std::vector<uint32_t> ggml_vk_find_memory_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
||||
std::vector<uint32_t> indices;
|
||||
|
||||
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
||||
if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
|
||||
(flags & memory_type.propertyFlags) == flags &&
|
||||
mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
|
||||
indices.push_back(i);
|
||||
}
|
||||
}
|
||||
return indices;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list,
|
||||
void *import_ptr = nullptr) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
|
||||
if (size > device->max_buffer_size) {
|
||||
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
|
||||
}
|
||||
|
||||
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
||||
|
||||
if (size == 0) {
|
||||
buf->size = 0;
|
||||
return buf;
|
||||
}
|
||||
|
||||
vk::BufferUsageFlags usage_flags = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst;
|
||||
vk::MemoryAllocateFlags mem_flags {};
|
||||
if (device->buffer_device_address) {
|
||||
usage_flags |= vk::BufferUsageFlagBits::eShaderDeviceAddress;
|
||||
mem_flags |= vk::MemoryAllocateFlagBits::eDeviceAddress;
|
||||
}
|
||||
|
||||
vk::BufferCreateInfo buffer_create_info{
|
||||
vk::BufferCreateFlags(),
|
||||
size,
|
||||
usage_flags,
|
||||
vk::SharingMode::eExclusive,
|
||||
0,
|
||||
nullptr,
|
||||
};
|
||||
|
||||
vk::ExternalMemoryBufferCreateInfo external_memory_bci;
|
||||
if (import_ptr) {
|
||||
external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
|
||||
buffer_create_info.setPNext(&external_memory_bci);
|
||||
}
|
||||
|
||||
buf->buffer = device->device.createBuffer(buffer_create_info);
|
||||
|
||||
vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
|
||||
|
||||
const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f };
|
||||
|
||||
vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
|
||||
|
||||
if (device->memory_priority) {
|
||||
mem_flags_info.setPNext(&mem_priority_info);
|
||||
}
|
||||
|
||||
if (import_ptr) {
|
||||
vk::MemoryHostPointerPropertiesEXT host_pointer_props;
|
||||
try {
|
||||
host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr);
|
||||
} catch (vk::SystemError& e) {
|
||||
GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
return {};
|
||||
}
|
||||
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
|
||||
|
||||
uint32_t memory_type_idx;
|
||||
vk::MemoryPropertyFlags property_flags = *req_flags_list.begin();
|
||||
for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
|
||||
if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
|
||||
continue;
|
||||
}
|
||||
if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
|
||||
// check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
|
||||
if ((memory_type.propertyFlags & property_flags) == property_flags) {
|
||||
property_flags = memory_type.propertyFlags;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (memory_type_idx == 32) {
|
||||
GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n");
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
return {};
|
||||
}
|
||||
|
||||
buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags;
|
||||
try {
|
||||
vk::ImportMemoryHostPointerInfoEXT import_info;
|
||||
import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
|
||||
import_info.pHostPointer = import_ptr;
|
||||
import_info.setPNext(&mem_flags_info);
|
||||
buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info });
|
||||
} catch (const vk::SystemError& e) {
|
||||
}
|
||||
} else {
|
||||
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
|
||||
const auto & req_flags = *it;
|
||||
|
||||
const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
|
||||
|
||||
if (memory_type_indices.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool done = false;
|
||||
|
||||
for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
|
||||
try {
|
||||
buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
|
||||
buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags;
|
||||
done = true;
|
||||
break;
|
||||
} catch (const vk::SystemError& e) {
|
||||
// loop and retry
|
||||
// during last attempt throw the exception
|
||||
if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!buf->device_memory) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
||||
buf->ptr = nullptr;
|
||||
|
||||
if (import_ptr) {
|
||||
buf->ptr = import_ptr;
|
||||
} else {
|
||||
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
|
||||
|
||||
buf->device = device;
|
||||
buf->size = size;
|
||||
|
||||
if (device->buffer_device_address) {
|
||||
const vk::BufferDeviceAddressInfo addressInfo(buf->buffer);
|
||||
buf->bda_addr = device->device.getBufferAddress(addressInfo);
|
||||
}
|
||||
|
||||
device->memory_logger->log_allocation(buf, size);
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags) {
|
||||
try {
|
||||
return ggml_vk_create_buffer(device, size, {req_flags, fallback_flags});
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
|
||||
vk_buffer buf;
|
||||
try {
|
||||
if (device->prefer_host_memory) {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
} else if (device->uma) {
|
||||
// On UMA, prefer host-visible memory so direct tensor borrowing works.
|
||||
// If unavailable, fall back to device-local memory.
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
|
||||
} else if (device->disable_host_visible_vidmem) {
|
||||
if (device->allow_sysmem_fallback) {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
|
||||
} else {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
}
|
||||
} else {
|
||||
// use rebar if available, otherwise fallback to device only visible memory
|
||||
if (device->allow_sysmem_fallback) {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
|
||||
} else {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
}
|
||||
}
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (buf->device != nullptr) {
|
||||
buf->device->memory_logger->log_deallocation(buf);
|
||||
}
|
||||
|
||||
buf.reset();
|
||||
}
|
||||
|
||||
void * ggml_vk_host_malloc(vk_device& device, size_t size) {
|
||||
VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
|
||||
vk_buffer buf = ggml_vk_create_buffer(device, size,
|
||||
{vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
|
||||
|
||||
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
||||
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
||||
size/1024.0/1024.0);
|
||||
device->device.freeMemory(buf->device_memory);
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::lock_guard<std::shared_mutex> guard(device->pinned_memory_mutex);
|
||||
device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
|
||||
|
||||
return buf->ptr;
|
||||
}
|
||||
|
||||
void ggml_vk_host_free(vk_device& device, void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
return;
|
||||
}
|
||||
VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
|
||||
std::lock_guard<std::shared_mutex> guard(device->pinned_memory_mutex);
|
||||
|
||||
vk_buffer buf;
|
||||
size_t index;
|
||||
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]);
|
||||
if (ptr >= addr && ptr < endr) {
|
||||
buf = std::get<2>(device->pinned_memory[i]);
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (buf == nullptr) {
|
||||
fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n");
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_vk_destroy_buffer(buf);
|
||||
|
||||
device->pinned_memory.erase(device->pinned_memory.begin() + index);
|
||||
}
|
||||
|
||||
void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
|
||||
std::shared_lock<std::shared_mutex> guard(device->pinned_memory_mutex);
|
||||
buf = nullptr;
|
||||
buf_offset = 0;
|
||||
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]);
|
||||
if (ptr >= addr && ptr < endr) {
|
||||
buf = std::get<2>(device->pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)ptr) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
|
||||
if (device->sync_staging == nullptr || device->sync_staging->size < size) {
|
||||
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
|
||||
ggml_vk_destroy_buffer(device->sync_staging);
|
||||
device->sync_staging = ggml_vk_create_buffer_check(device, size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
|
||||
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
|
||||
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
|
||||
ggml_vk_destroy_buffer(ctx->sync_staging);
|
||||
ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
|
||||
GGML_ASSERT(!ggml_is_contiguous(tensor));
|
||||
// Buffer is already mapped
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
|
||||
|
||||
const uint64_t ne0 = tensor->ne[0];
|
||||
const uint64_t ne1 = tensor->ne[1];
|
||||
const uint64_t ne2 = tensor->ne[2];
|
||||
const uint64_t ne3 = tensor->ne[3];
|
||||
const uint64_t nb0 = tensor->nb[0];
|
||||
const uint64_t nb1 = tensor->nb[1];
|
||||
const uint64_t nb2 = tensor->nb[2];
|
||||
const uint64_t nb3 = tensor->nb[3];
|
||||
const ggml_type type = tensor->type;
|
||||
const uint64_t ts = ggml_type_size(type);
|
||||
const uint64_t bs = ggml_blck_size(type);
|
||||
|
||||
const uint64_t dstnb0 = ts;
|
||||
const uint64_t dstnb1 = dstnb0*(ne0/bs);
|
||||
const uint64_t dstnb2 = dstnb1*ne1;
|
||||
const uint64_t dstnb3 = dstnb2*ne2;
|
||||
|
||||
const uint64_t ne = ggml_nelements(tensor);
|
||||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
std::vector<vk::BufferCopy> slices;
|
||||
|
||||
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||
// Find longest contiguous slice
|
||||
if (ne1*nb1 == dstnb2) {
|
||||
slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
|
||||
} else {
|
||||
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ne0*nb0/bs == dstnb1) {
|
||||
slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
|
||||
} else {
|
||||
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
||||
const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
||||
for (uint64_t i0 = 0; i0 < ne0; i0++) {
|
||||
slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!sync_staging) {
|
||||
GGML_ABORT("Asynchronous write to non-pinned memory not supported");
|
||||
}
|
||||
|
||||
// Staging buffer required
|
||||
vk_buffer& staging = ctx->device->sync_staging;
|
||||
const uint64_t copy_size = ts*ne/bs;
|
||||
ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
|
||||
VkBufferCopy buf_copy{ 0, offset, copy_size };
|
||||
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
|
||||
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||
// Find longest contiguous slice
|
||||
if (ne1*nb1 == dstnb2) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
|
||||
} else {
|
||||
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ne0*nb0/bs == dstnb1) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
|
||||
} else {
|
||||
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
||||
const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
||||
for (uint64_t i0 = 0; i0 < ne0; i0++) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
|
||||
// Check if src is pinned memory
|
||||
vk_buffer buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
ggml_vk_host_get(dst->device, src, buf, buf_offset);
|
||||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
if (width == spitch && width == dpitch) {
|
||||
// Only do single write if stride is equal
|
||||
slices[0].srcOffset = buf_offset;
|
||||
slices[0].dstOffset = offset;
|
||||
slices[0].size = width * height;
|
||||
} else {
|
||||
slices.resize(height);
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
slices[i].srcOffset = buf_offset + i * spitch;
|
||||
slices[i].dstOffset = offset + i * dpitch;
|
||||
slices[i].size = width;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
return true;
|
||||
}
|
||||
VK_LOG_DEBUG("STAGING");
|
||||
|
||||
if (!sync_staging) {
|
||||
// copy was not handled caller needs to fall back
|
||||
return false;
|
||||
}
|
||||
|
||||
// Staging buffer required
|
||||
const size_t staging_size = width * height;
|
||||
ggml_vk_ensure_sync_staging_buffer(dst->device, staging_size);
|
||||
|
||||
vk_buffer& staging_buffer = dst->device->sync_staging;
|
||||
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
if (width == dpitch) {
|
||||
slices[0].srcOffset = 0;
|
||||
slices[0].dstOffset = offset;
|
||||
slices[0].size = staging_size;
|
||||
} else {
|
||||
slices.resize(height);
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
slices[i].srcOffset = i * width;
|
||||
slices[i].dstOffset = offset + i * dpitch;
|
||||
slices[i].size = width;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer->buf.copyBuffer((VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, slices);
|
||||
|
||||
if (width == spitch) {
|
||||
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, staging_size, &subctx->in_memcpys);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
|
||||
return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, size, 1, sync_staging);
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
|
||||
// Buffer is already mapped
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
if (width == spitch && width == dpitch) {
|
||||
memcpy((uint8_t *)dst->ptr + offset, src, width * height);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dst->device, subctx);
|
||||
bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, dpitch, width, height, true);
|
||||
GGML_ASSERT(ret);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
|
||||
for (auto& cpy : subctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
for (auto& mset : subctx->memsets) {
|
||||
memset(mset.dst, mset.val, mset.n);
|
||||
}
|
||||
|
||||
ggml_vk_submit(subctx, dst->device->fence);
|
||||
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
||||
dst->device->device.resetFences({ dst->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(dst->device);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
|
||||
ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1);
|
||||
}
|
||||
|
||||
bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
|
||||
GGML_ASSERT(width > 0);
|
||||
GGML_ASSERT(height > 0);
|
||||
GGML_ASSERT(src != nullptr);
|
||||
|
||||
// TODO: staging_offset is not used
|
||||
|
||||
// Check if dst is pinned memory
|
||||
vk_buffer buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
ggml_vk_host_get(src->device, dst, buf, buf_offset);
|
||||
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
if (width == spitch && width == dpitch) {
|
||||
// Only do single write if stride is equal
|
||||
slices[0].srcOffset = offset;
|
||||
slices[0].dstOffset = buf_offset;
|
||||
slices[0].size = width * height;
|
||||
} else {
|
||||
slices.resize(height);
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
slices[i].srcOffset = offset + i * spitch;
|
||||
slices[i].dstOffset = buf_offset + i * dpitch;
|
||||
slices[i].size = width;
|
||||
}
|
||||
}
|
||||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices);
|
||||
|
||||
return true;
|
||||
}
|
||||
VK_LOG_DEBUG("STAGING");
|
||||
|
||||
if (!sync_staging) {
|
||||
// copy was not handled caller needs to fall back
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fall back to staging buffer
|
||||
const size_t staging_size = width * height;
|
||||
ggml_vk_ensure_sync_staging_buffer(src->device, staging_size);
|
||||
|
||||
vk_buffer& staging_buffer = src->device->sync_staging;
|
||||
|
||||
std::vector<vk::BufferCopy> staging_slices(1);
|
||||
if (width == spitch) {
|
||||
staging_slices[0].srcOffset = offset;
|
||||
staging_slices[0].dstOffset = 0;
|
||||
staging_slices[0].size = staging_size;
|
||||
} else {
|
||||
staging_slices.resize(height);
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
staging_slices[i].srcOffset = offset + i * spitch;
|
||||
staging_slices[i].dstOffset = i * width;
|
||||
staging_slices[i].size = width;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, staging_slices);
|
||||
|
||||
if (width == dpitch) {
|
||||
deferred_memcpy(dst, staging_buffer->ptr, staging_size, &subctx->out_memcpys);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
deferred_memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) staging_buffer->ptr + i * width, width, &subctx->out_memcpys);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
|
||||
return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging);
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_read_2d(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")");
|
||||
|
||||
// If the device is not an UMA device the memory is host-accessible through rebar. While writing
|
||||
// through PCIe is sufficient fast reading back data from PCIe is slower than going through
|
||||
// the HW device to host copy path.
|
||||
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
|
||||
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
subctx->s->buffer->buf.pipelineBarrier(
|
||||
vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
{},
|
||||
{ { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite,
|
||||
vk::AccessFlagBits::eHostRead } },
|
||||
{}, {});
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX),
|
||||
"vk_buffer_read_2d uma waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
|
||||
if (width == spitch && width == dpitch) {
|
||||
memcpy(dst, (const uint8_t *) src->ptr + offset, width * height);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
bool ret = ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, spitch, dpitch, width, height, true);
|
||||
GGML_ASSERT(ret);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read_2d waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
|
||||
for (auto& cpy : subctx->out_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
|
||||
ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1);
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
|
||||
// Make sure both buffers are on same device
|
||||
GGML_ASSERT(src->device == dst->device);
|
||||
|
||||
VkBufferCopy bc{ src_offset, dst_offset, size };
|
||||
|
||||
vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
||||
if (src->device == dst->device) {
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
|
||||
// Copy within the device
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
} else {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
|
||||
// Copy device to device
|
||||
ggml_vk_ensure_sync_staging_buffer(src->device, size);
|
||||
|
||||
// Copy to src staging buffer
|
||||
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
|
||||
// Copy to dst buffer
|
||||
ggml_vk_buffer_write(dst, dst_offset, src->device->sync_staging->ptr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
|
||||
|
||||
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
|
||||
dst->device->uma) {
|
||||
deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets);
|
||||
return;
|
||||
}
|
||||
|
||||
// Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
|
||||
ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
|
||||
}
|
||||
|
||||
void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
|
||||
|
||||
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
|
||||
dst->device->uma) {
|
||||
memset((uint8_t*)dst->ptr + offset, c, size);
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dst->device, subctx);
|
||||
subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
|
||||
ggml_vk_submit(subctx, dst->device->fence);
|
||||
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
|
||||
dst->device->device.resetFences({ dst->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(dst->device);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
||||
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_vk_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
|
||||
/* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor,
|
||||
/* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
|
||||
/* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
|
||||
/* .set_tensor_2d = */ ggml_backend_vk_buffer_set_tensor_2d,
|
||||
/* .get_tensor_2d = */ ggml_backend_vk_buffer_get_tensor_2d,
|
||||
/* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor,
|
||||
/* .clear = */ ggml_backend_vk_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
|
||||
if (!device->external_memory_host) {
|
||||
return {};
|
||||
}
|
||||
|
||||
uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
|
||||
if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
|
||||
return {};
|
||||
}
|
||||
if (size & (device->min_imported_host_pointer_alignment - 1)) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
|
||||
|
||||
vk_buffer buf {};
|
||||
try {
|
||||
buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr);
|
||||
} catch (vk::SystemError& e) {
|
||||
GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
228
ggml/src/ggml-vulkan/ggml-vulkan-common.h
Normal file
228
ggml/src/ggml-vulkan/ggml-vulkan-common.h
Normal file
@ -0,0 +1,228 @@
|
||||
#pragma once
|
||||
#include "ggml-vulkan-push-constants.h"
|
||||
|
||||
extern std::mutex queue_mutex;
|
||||
extern ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface;
|
||||
extern bool vk_memory_logger_enabled;
|
||||
extern bool vk_perf_logger_enabled;
|
||||
extern bool vk_perf_logger_concurrent;
|
||||
extern bool vk_enable_sync_logger;
|
||||
extern uint32_t vk_perf_logger_frequency;
|
||||
extern std::string vk_pipeline_stats_filter;
|
||||
extern void * const vk_ptr_base;
|
||||
extern vk_instance_t vk_instance;
|
||||
extern ggml_backend_buffer_i ggml_backend_vk_buffer_interface;
|
||||
|
||||
uint64_t vk_tensor_offset(const ggml_tensor * tensor);
|
||||
uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t);
|
||||
void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx);
|
||||
void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
|
||||
void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n);
|
||||
void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
|
||||
void ggml_vk_submit(vk_context& ctx, vk::Fence fence);
|
||||
uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues);
|
||||
void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only);
|
||||
vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p);
|
||||
vk_context ggml_vk_create_temporary_context(vk_command_pool& p);
|
||||
void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p);
|
||||
void ggml_vk_queue_command_pools_cleanup(vk_device& device);
|
||||
vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0));
|
||||
vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size);
|
||||
void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0);
|
||||
void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx);
|
||||
void ggml_vk_set_event(vk_context& ctx, vk::Event& event);
|
||||
void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events);
|
||||
vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc);
|
||||
vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, bool use_mask, bool use_mask_opt, bool use_logit_softcap, ggml_type k_type, ggml_type v_type);
|
||||
uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch);
|
||||
void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr);
|
||||
vk_device ggml_vk_get_device(size_t idx);
|
||||
DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
void ggml_vk_instance_init();
|
||||
void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx);
|
||||
vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type);
|
||||
void * ggml_vk_host_malloc(vk_device& device, size_t size);
|
||||
void ggml_vk_host_free(vk_device& device, void* ptr);
|
||||
void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset);
|
||||
vk_subbuffer ggml_vk_tensor_subbuffer( const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false);
|
||||
void ggml_vk_ctx_end(vk_context& ctx);
|
||||
void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx);
|
||||
vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx);
|
||||
bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx);
|
||||
size_t ggml_vk_align_size(size_t width, size_t align);
|
||||
void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys = nullptr);
|
||||
void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets = nullptr);
|
||||
void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size);
|
||||
void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size);
|
||||
bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false);
|
||||
bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false);
|
||||
void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height);
|
||||
void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size);
|
||||
bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false);
|
||||
void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height);
|
||||
void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size);
|
||||
void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size);
|
||||
void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size);
|
||||
void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size);
|
||||
void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size);
|
||||
void ggml_vk_matmul( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t padded_n);
|
||||
bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor);
|
||||
vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to);
|
||||
void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out);
|
||||
vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type);
|
||||
void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, const vk_subbuffer & in, const vk_subbuffer & out, uint32_t ne);
|
||||
bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst);
|
||||
void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst);
|
||||
void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx);
|
||||
void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type);
|
||||
bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16);
|
||||
void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst);
|
||||
void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx);
|
||||
void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst);
|
||||
void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst);
|
||||
void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
|
||||
void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node);
|
||||
void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params);
|
||||
void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst);
|
||||
void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx);
|
||||
void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop);
|
||||
void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx);
|
||||
void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
|
||||
void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
|
||||
bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit);
|
||||
void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);
|
||||
void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx);
|
||||
void ggml_vk_cleanup(ggml_backend_vk_context * ctx);
|
||||
int ggml_vk_get_device_count();
|
||||
void ggml_vk_get_device_description(int device, char * description, size_t description_size);
|
||||
bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer);
|
||||
void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer);
|
||||
void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer);
|
||||
enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor);
|
||||
void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
|
||||
void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
|
||||
bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst);
|
||||
void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value);
|
||||
const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
|
||||
ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
||||
size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
|
||||
size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
|
||||
size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
|
||||
void ggml_backend_vk_free(ggml_backend_t backend);
|
||||
void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
|
||||
bool ggml_vk_is_empty(ggml_tensor * node);
|
||||
bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops);
|
||||
bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, int num_extra);
|
||||
bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, topk_moe_mode mode);
|
||||
bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise);
|
||||
bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
|
||||
void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph);
|
||||
int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op);
|
||||
vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
|
||||
ggml_backend_reg_t ggml_backend_vk_reg();
|
||||
bool ggml_vk_instance_layer_settings_available();
|
||||
bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
||||
bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
|
||||
bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
|
||||
bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
|
||||
uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev);
|
||||
|
||||
template <typename T>
|
||||
static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
|
||||
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
|
||||
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
|
||||
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
|
||||
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
|
||||
for (auto& buffer : descriptor_buffer_infos) {
|
||||
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
|
||||
}
|
||||
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
|
||||
GGML_ASSERT(wg0 <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] &&
|
||||
wg1 <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
|
||||
wg2 <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
|
||||
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
|
||||
GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
|
||||
GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));
|
||||
|
||||
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
|
||||
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
||||
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
|
||||
|
||||
subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
|
||||
subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
|
||||
subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
|
||||
pipeline->layout,
|
||||
0,
|
||||
{ descriptor_set },
|
||||
{});
|
||||
subctx->s->buffer->buf.dispatch(wg0, wg1, wg2);
|
||||
}
|
||||
|
||||
1608
ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp
Normal file
1608
ggml/src/ggml-vulkan/ggml-vulkan-debug.cpp
Normal file
File diff suppressed because it is too large
Load Diff
293
ggml/src/ggml-vulkan/ggml-vulkan-flash-attn.cpp
Normal file
293
ggml/src/ggml-vulkan/ggml-vulkan-flash-attn.cpp
Normal file
@ -0,0 +1,293 @@
|
||||
#include "ggml-vulkan-common.h"
|
||||
|
||||
void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) {
|
||||
VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
|
||||
std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
|
||||
std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3];
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
|
||||
if (sinks) {
|
||||
std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3];
|
||||
}
|
||||
std::cerr << "))");
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const uint32_t nem0 = mask ? mask->ne[0] : 0;
|
||||
const uint32_t nem1 = mask ? mask->ne[1] : 0;
|
||||
const uint32_t nem2 = mask ? mask->ne[2] : 0;
|
||||
const uint32_t nem3 = mask ? mask->ne[3] : 0;
|
||||
|
||||
const uint32_t HSK = nek0;
|
||||
const uint32_t HSV = nev0;
|
||||
uint32_t N = neq1;
|
||||
const uint32_t KV = nek1;
|
||||
|
||||
GGML_ASSERT(ne0 == HSV);
|
||||
GGML_ASSERT(ne2 == N);
|
||||
|
||||
// input tensor rows must be contiguous
|
||||
GGML_ASSERT(nbq0 == ggml_type_size(q->type));
|
||||
GGML_ASSERT(nbk0 == ggml_type_size(k->type));
|
||||
GGML_ASSERT(nbv0 == ggml_type_size(v->type));
|
||||
|
||||
GGML_ASSERT(neq0 == HSK);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
|
||||
GGML_ASSERT(nev1 == nek1);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
assert(dst->type == GGML_TYPE_F32);
|
||||
assert(q->type == GGML_TYPE_F32);
|
||||
uint32_t gqa_ratio = 1;
|
||||
uint32_t qk_ratio = neq2 / nek2;
|
||||
uint32_t workgroups_x = (uint32_t)neq1;
|
||||
uint32_t workgroups_y = (uint32_t)neq2;
|
||||
uint32_t workgroups_z = (uint32_t)neq3;
|
||||
|
||||
const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16;
|
||||
|
||||
// For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
|
||||
// For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
|
||||
vk_fa_tuning_params tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, 512, KV, k->type, v->type, f32acc);
|
||||
const uint32_t max_gqa = std::min(tuning_params.block_rows, 32u);
|
||||
|
||||
if (N <= 8 && qk_ratio > 1 && qk_ratio <= max_gqa &&
|
||||
qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) {
|
||||
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
|
||||
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
|
||||
// and change addressing calculations to index Q's dimension 2.
|
||||
gqa_ratio = qk_ratio;
|
||||
N = gqa_ratio;
|
||||
workgroups_y /= gqa_ratio;
|
||||
}
|
||||
|
||||
tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, N, KV, k->type, v->type, f32acc);
|
||||
|
||||
const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
|
||||
uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
|
||||
uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));
|
||||
|
||||
// For F32, the shader treats it as a block of size 4 (for vec4 loads)
|
||||
if (k->type == GGML_TYPE_F32) {
|
||||
k_stride /= 4;
|
||||
}
|
||||
if (v->type == GGML_TYPE_F32) {
|
||||
v_stride /= 4;
|
||||
}
|
||||
|
||||
const uint32_t alignment = tuning_params.block_cols;
|
||||
bool aligned = (KV % alignment) == 0 &&
|
||||
// the "aligned" shader variant will forcibly align strides, for performance
|
||||
(q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
|
||||
|
||||
// Need to use the coopmat2 variant that clamps loads when HSK/HSV aren't sufficiently aligned.
|
||||
if (((HSK | HSV) % 16) != 0 && tuning_params.path == FA_COOPMAT2) {
|
||||
aligned = false;
|
||||
}
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float logit_softcap = 0.0f;
|
||||
|
||||
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
|
||||
memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float));
|
||||
|
||||
if (logit_softcap != 0) {
|
||||
scale /= logit_softcap;
|
||||
}
|
||||
|
||||
// Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
|
||||
bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16;
|
||||
vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
|
||||
mask != nullptr, use_mask_opt, logit_softcap != 0, k->type, v->type);
|
||||
|
||||
vk_pipeline pipeline = nullptr;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16;
|
||||
auto it = pipelines.find(fa_pipeline_state);
|
||||
if (it != pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
|
||||
}
|
||||
}
|
||||
|
||||
assert(pipeline);
|
||||
// Compile early to initialize wg_denoms.
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
uint32_t split_kv = KV;
|
||||
uint32_t split_k = 1;
|
||||
|
||||
// Intel Alchemist prefers more workgroups
|
||||
const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2) ? 2 : 1;
|
||||
|
||||
// Use a placeholder core count if one isn't available. split_k is a big help for perf.
|
||||
const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count * shader_core_count_multiplier : 16;
|
||||
|
||||
const uint32_t Br = fa_pipeline_state.Br;
|
||||
const uint32_t Bc = fa_pipeline_state.Bc;
|
||||
|
||||
GGML_ASSERT(Br == pipeline->wg_denoms[0]);
|
||||
const uint32_t Tr = CEIL_DIV(N, Br);
|
||||
|
||||
// Try to use split_k when KV is large enough to be worth the overhead.
|
||||
if (gqa_ratio > 1 && workgroups_x <= Br) {
|
||||
split_k = shader_core_count * 2 / (workgroups_x * workgroups_y * workgroups_z);
|
||||
} else if (gqa_ratio <= 1) {
|
||||
uint32_t total_wgs_no_split = Tr * workgroups_y * workgroups_z;
|
||||
if (total_wgs_no_split < shader_core_count * 2) {
|
||||
split_k = shader_core_count * 2 / total_wgs_no_split;
|
||||
}
|
||||
}
|
||||
|
||||
if (split_k > 1) {
|
||||
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
|
||||
// of "align", so recompute split_k based on that.
|
||||
split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment);
|
||||
split_k = CEIL_DIV(KV, split_kv);
|
||||
}
|
||||
|
||||
// Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
|
||||
// and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
|
||||
// For matrices, the order is (inner to outer) [HSV, ne1, k, ne2, ne3].
|
||||
// For L/M, the order is (inner to outer) [ne1, k, ne2, ne3].
|
||||
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne2 * ne3 : 0;
|
||||
if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
|
||||
GGML_ABORT("Requested preallocation size is too large");
|
||||
}
|
||||
if (ctx->prealloc_size_split_k < split_k_size) {
|
||||
ctx->prealloc_size_split_k = split_k_size;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
const uint32_t mask_opt_num_dwords = CEIL_DIV(nem0, 16 * Bc);
|
||||
const uint64_t mask_opt_size = sizeof(uint32_t) * mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2 * nem3;
|
||||
|
||||
vk_pipeline pipeline_fa_mask_opt = nullptr;
|
||||
if (use_mask_opt) {
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto &pipelines = ctx->device->pipeline_fa_mask_opt;
|
||||
auto it = pipelines.find({Br, Bc});
|
||||
if (it != pipelines.end()) {
|
||||
pipeline_fa_mask_opt = it->second;
|
||||
} else {
|
||||
pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared<vk_pipeline_struct>();
|
||||
}
|
||||
}
|
||||
assert(pipeline_fa_mask_opt);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1);
|
||||
|
||||
if (ctx->prealloc_size_y < mask_opt_size) {
|
||||
ctx->prealloc_size_y = mask_opt_size;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_head_kv = neq2;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q);
|
||||
vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k);
|
||||
vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v);
|
||||
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
||||
vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf;
|
||||
vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf;
|
||||
vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf;
|
||||
|
||||
uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2;
|
||||
|
||||
if (use_mask_opt)
|
||||
{
|
||||
const vk_op_flash_attn_mask_opt_push_constants opt_pc = {
|
||||
nem0,
|
||||
nem1,
|
||||
nem2,
|
||||
(uint32_t)(mask->nb[1] / sizeof(ggml_fp16_t)),
|
||||
(uint32_t)(mask->nb[2] / sizeof(ggml_fp16_t)),
|
||||
(uint32_t)(mask->nb[3] / sizeof(ggml_fp16_t)),
|
||||
mask_opt_num_dwords,
|
||||
mask_opt_num_dwords * CEIL_DIV(nem1, Br),
|
||||
mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2,
|
||||
};
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline_fa_mask_opt,
|
||||
{ mask_buf, mask_opt_buf }, opt_pc,
|
||||
{ mask_opt_num_dwords, CEIL_DIV(nem1, Br), nem2 * nem3 });
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
const vk_flash_attn_push_constants pc = { N, KV,
|
||||
(uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
|
||||
(uint32_t)neq2, (uint32_t)neq3,
|
||||
(uint32_t)nek2, (uint32_t)nek3,
|
||||
(uint32_t)nev2, (uint32_t)nev3,
|
||||
nem1, nem2, nem3,
|
||||
q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
|
||||
k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
|
||||
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
|
||||
scale, max_bias, logit_softcap,
|
||||
mask_n_head_log2, m0, m1,
|
||||
gqa_ratio, split_kv, split_k };
|
||||
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
||||
|
||||
if (ctx->prealloc_split_k_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
// We reuse workgroups_x to mean the number of splits, so we need to
|
||||
// cancel out the divide by wg_denoms[0].
|
||||
uint32_t dispatch_x;
|
||||
if (gqa_ratio > 1) {
|
||||
workgroups_x *= pipeline->wg_denoms[0];
|
||||
dispatch_x = split_k * workgroups_x;
|
||||
} else {
|
||||
dispatch_x = Tr * split_k * pipeline->wg_denoms[0];
|
||||
}
|
||||
|
||||
vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf, mask_opt_buf},
|
||||
pc, { dispatch_x, workgroups_y, workgroups_z });
|
||||
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
const vk_op_flash_attn_split_k_reduce_push_constants pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, split_k, (sinks != nullptr) };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
|
||||
{split_k_buf, sinks_buf, dst_buf},
|
||||
pc2, { (uint32_t)ne1, HSV, (uint32_t)(ne2 * ne3) });
|
||||
ctx->prealloc_split_k_need_sync = true;
|
||||
} else {
|
||||
if (gqa_ratio > 1) {
|
||||
// When using gqa, we want one actual workgroup per batch, so cancel out wg_denoms
|
||||
workgroups_x *= pipeline->wg_denoms[0];
|
||||
}
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf, mask_opt_buf},
|
||||
pc, { workgroups_x, workgroups_y, workgroups_z });
|
||||
}
|
||||
}
|
||||
|
||||
1610
ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp
Normal file
1610
ggml/src/ggml-vulkan/ggml-vulkan-graph.cpp
Normal file
File diff suppressed because it is too large
Load Diff
1618
ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp
Normal file
1618
ggml/src/ggml-vulkan/ggml-vulkan-instance.cpp
Normal file
File diff suppressed because it is too large
Load Diff
2207
ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp
Normal file
2207
ggml/src/ggml-vulkan/ggml-vulkan-matmul.cpp
Normal file
File diff suppressed because it is too large
Load Diff
2893
ggml/src/ggml-vulkan/ggml-vulkan-operators.cpp
Normal file
2893
ggml/src/ggml-vulkan/ggml-vulkan-operators.cpp
Normal file
File diff suppressed because it is too large
Load Diff
512
ggml/src/ggml-vulkan/ggml-vulkan-pipelines.cpp
Normal file
512
ggml/src/ggml-vulkan/ggml-vulkan-pipelines.cpp
Normal file
@ -0,0 +1,512 @@
|
||||
#include "ggml-vulkan-common.h"
|
||||
|
||||
std::mutex queue_mutex;
|
||||
|
||||
void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
|
||||
|
||||
uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
|
||||
if (tensor->view_src) {
|
||||
return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
|
||||
uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t)
|
||||
{
|
||||
return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
|
||||
}
|
||||
|
||||
static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) {
|
||||
const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset},
|
||||
VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
|
||||
return range;
|
||||
}
|
||||
|
||||
void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
|
||||
// Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
|
||||
// during this wait.
|
||||
if (ctx->almost_ready_fence_pending) {
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
|
||||
ctx->device->device.resetFences({ ctx->almost_ready_fence });
|
||||
ctx->almost_ready_fence_pending = false;
|
||||
}
|
||||
|
||||
// Spin (w/pause) waiting for the graph to finish executing.
|
||||
vk::Result result;
|
||||
while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
|
||||
if (result != vk::Result::eNotReady) {
|
||||
fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
|
||||
exit(1);
|
||||
}
|
||||
for (uint32_t i = 0; i < 100; ++i) {
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
YIELD();
|
||||
}
|
||||
}
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
}
|
||||
|
||||
void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
|
||||
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
||||
ctx->pipeline_descriptor_set_requirements += n;
|
||||
if (!pipeline->compiled) {
|
||||
ggml_vk_load_shaders(ctx->device, pipeline);
|
||||
}
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
}
|
||||
|
||||
void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
|
||||
|
||||
if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
|
||||
// Enough descriptors are available
|
||||
return;
|
||||
}
|
||||
|
||||
vk_device& device = ctx->device;
|
||||
|
||||
// Grow by 50% to avoid frequent allocations
|
||||
uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements});
|
||||
uint32_t to_alloc = needed - ctx->descriptor_sets.size();
|
||||
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
|
||||
while (to_alloc > 0) {
|
||||
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
|
||||
to_alloc -= alloc_count;
|
||||
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
|
||||
if (pool_idx >= ctx->descriptor_pools.size()) {
|
||||
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
||||
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
||||
ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
||||
}
|
||||
|
||||
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
|
||||
for (uint32_t i = 0; i < alloc_count; i++) {
|
||||
layouts[i] = device->dsl;
|
||||
}
|
||||
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
|
||||
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
||||
ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
|
||||
|
||||
pool_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
|
||||
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
||||
p.pool,
|
||||
vk::CommandBufferLevel::ePrimary,
|
||||
1);
|
||||
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
|
||||
p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true });
|
||||
return &p.cmd_buffers[p.cmd_buffers.size()-1];
|
||||
}
|
||||
|
||||
void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
if (ctx->seqs.empty()) {
|
||||
if (fence) {
|
||||
std::lock_guard<std::mutex> guard(queue_mutex);
|
||||
ctx->p->q->queue.submit({}, fence);
|
||||
}
|
||||
return;
|
||||
}
|
||||
VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
|
||||
|
||||
std::vector<std::vector<uint64_t>> tl_wait_vals;
|
||||
std::vector<std::vector<uint64_t>> tl_signal_vals;
|
||||
std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
|
||||
std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
|
||||
std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
|
||||
std::vector<vk::SubmitInfo> submit_infos;
|
||||
int idx = -1;
|
||||
std::vector<std::vector<vk::PipelineStageFlags>> stage_flags;
|
||||
|
||||
size_t reserve = 0;
|
||||
|
||||
for (const auto& sequence : ctx->seqs) {
|
||||
reserve += sequence.size();
|
||||
}
|
||||
|
||||
// Pre-reserve vectors to prevent reallocation, which invalidates pointers
|
||||
tl_wait_semaphores.reserve(reserve);
|
||||
tl_wait_vals.reserve(reserve);
|
||||
tl_signal_semaphores.reserve(reserve);
|
||||
tl_signal_vals.reserve(reserve);
|
||||
tl_submit_infos.reserve(reserve);
|
||||
submit_infos.reserve(reserve);
|
||||
stage_flags.reserve(reserve);
|
||||
|
||||
for (const auto& sequence : ctx->seqs) {
|
||||
for (const auto& submission : sequence) {
|
||||
stage_flags.push_back({});
|
||||
idx++;
|
||||
tl_wait_vals.push_back({});
|
||||
tl_wait_semaphores.push_back({});
|
||||
tl_signal_vals.push_back({});
|
||||
tl_signal_semaphores.push_back({});
|
||||
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
|
||||
stage_flags[idx].push_back(ctx->p->q->stage_flags);
|
||||
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
|
||||
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
|
||||
}
|
||||
for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
|
||||
tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
|
||||
tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
|
||||
}
|
||||
tl_submit_infos.push_back({
|
||||
(uint32_t) submission.wait_semaphores.size(),
|
||||
tl_wait_vals[idx].data(),
|
||||
(uint32_t) submission.signal_semaphores.size(),
|
||||
tl_signal_vals[idx].data(),
|
||||
});
|
||||
tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo;
|
||||
tl_submit_infos[idx].pNext = nullptr;
|
||||
vk::SubmitInfo si{
|
||||
(uint32_t) submission.wait_semaphores.size(),
|
||||
tl_wait_semaphores[idx].data(),
|
||||
stage_flags[idx].data(),
|
||||
1,
|
||||
&submission.buffer->buf,
|
||||
(uint32_t) submission.signal_semaphores.size(),
|
||||
tl_signal_semaphores[idx].data(),
|
||||
};
|
||||
si.setPNext(&tl_submit_infos[idx]);
|
||||
submit_infos.push_back(si);
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(queue_mutex);
|
||||
ctx->p->q->queue.submit(submit_infos, fence);
|
||||
|
||||
ctx->seqs.clear();
|
||||
}
|
||||
|
||||
uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
|
||||
VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
|
||||
const uint32_t qfsize = queue_family_props.size();
|
||||
|
||||
// Try with avoid preferences first
|
||||
for (uint32_t i = 0; i < qfsize; i++) {
|
||||
if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to only required
|
||||
for (size_t i = 0; i < qfsize; i++) {
|
||||
if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to reusing compute queue
|
||||
for (size_t i = 0; i < qfsize; i++) {
|
||||
if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to ignoring min_num_queries
|
||||
for (size_t i = 0; i < qfsize; i++) {
|
||||
if (queue_family_props[i].queueFlags & required) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
|
||||
// Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
|
||||
if (compute_index >= 0) {
|
||||
return compute_index;
|
||||
}
|
||||
|
||||
std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
|
||||
|
||||
for(auto &q_family : queue_family_props) {
|
||||
std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl;
|
||||
}
|
||||
abort();
|
||||
}
|
||||
|
||||
void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_queue()");
|
||||
std::lock_guard<std::recursive_mutex> guard(device->mutex);
|
||||
|
||||
q.queue_family_index = queue_family_index;
|
||||
q.transfer_only = transfer_only;
|
||||
|
||||
q.cmd_pool.init(device, &q);
|
||||
|
||||
q.queue = device->device.getQueue(queue_family_index, queue_index);
|
||||
|
||||
q.stage_flags = stage_flags;
|
||||
}
|
||||
|
||||
vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
|
||||
vk_context result = std::make_shared<vk_context_struct>();
|
||||
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
|
||||
ctx->gc.contexts.emplace_back(result);
|
||||
result->p = &p;
|
||||
return result;
|
||||
}
|
||||
|
||||
vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
|
||||
vk_context result = std::make_shared<vk_context_struct>();
|
||||
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
|
||||
result->p = &p;
|
||||
return result;
|
||||
}
|
||||
|
||||
static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
|
||||
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
|
||||
vk::SemaphoreCreateInfo ci{};
|
||||
ci.setPNext(&tci);
|
||||
vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
|
||||
ctx->gc.semaphores.push_back({ semaphore, 0 });
|
||||
return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
|
||||
}
|
||||
|
||||
static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
|
||||
if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
|
||||
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
|
||||
vk::SemaphoreCreateInfo ci{};
|
||||
ci.setPNext(&tci);
|
||||
vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
|
||||
ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
|
||||
}
|
||||
return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
|
||||
}
|
||||
|
||||
static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
|
||||
if (ctx->event_idx >= ctx->gc.events.size()) {
|
||||
ctx->gc.events.push_back(ctx->device->device.createEvent({}));
|
||||
}
|
||||
return ctx->gc.events[ctx->event_idx++];
|
||||
}
|
||||
|
||||
void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
|
||||
VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
|
||||
|
||||
// Requires command buffers to be done
|
||||
device->device.resetCommandPool(p.pool);
|
||||
// Don't clear the command buffers and mark them as not in use.
|
||||
// This allows us to reuse them
|
||||
for (auto& cmd_buffer : p.cmd_buffers) {
|
||||
cmd_buffer.in_use = false;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
|
||||
VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
|
||||
|
||||
// Arbitrary frequency to cleanup/reuse command buffers
|
||||
static constexpr uint32_t cleanup_frequency = 10;
|
||||
|
||||
if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
|
||||
ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
|
||||
}
|
||||
if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
|
||||
ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
|
||||
}
|
||||
}
|
||||
|
||||
vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset) {
|
||||
return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) };
|
||||
}
|
||||
|
||||
void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
|
||||
|
||||
const bool transfer_queue = subctx->p->q->transfer_only;
|
||||
|
||||
if (ctx) {
|
||||
ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
|
||||
}
|
||||
|
||||
subctx->s->buffer->buf.pipelineBarrier(
|
||||
subctx->p->q->stage_flags,
|
||||
subctx->p->q->stage_flags,
|
||||
{},
|
||||
{ {
|
||||
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
|
||||
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }
|
||||
} },
|
||||
{},
|
||||
{}
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) {
|
||||
VK_LOG_DEBUG("ggml_vk_set_event()");
|
||||
|
||||
ctx->s->buffer->buf.resetEvent(
|
||||
event,
|
||||
ctx->p->q->stage_flags
|
||||
);
|
||||
}
|
||||
|
||||
void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
|
||||
VK_LOG_DEBUG("ggml_vk_set_event()");
|
||||
|
||||
ctx->s->buffer->buf.setEvent(
|
||||
event,
|
||||
ctx->p->q->stage_flags
|
||||
);
|
||||
}
|
||||
|
||||
void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
|
||||
VK_LOG_DEBUG("ggml_vk_wait_events()");
|
||||
if (events.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->s->buffer->buf.waitEvents(
|
||||
events,
|
||||
ctx->p->q->stage_flags,
|
||||
ctx->p->q->stage_flags,
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
);
|
||||
}
|
||||
|
||||
vk_subbuffer ggml_vk_tensor_subbuffer(
|
||||
const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign) {
|
||||
|
||||
vk_buffer buffer = nullptr;
|
||||
size_t offset = 0;
|
||||
if (ctx->device->uma) {
|
||||
ggml_vk_host_get(ctx->device, tensor->data, buffer, offset);
|
||||
}
|
||||
if (!buffer) {
|
||||
auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
||||
buffer = buf_ctx->dev_buffer;
|
||||
offset = vk_tensor_offset(tensor) + tensor->view_offs;
|
||||
}
|
||||
GGML_ASSERT(buffer != nullptr);
|
||||
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
|
||||
size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
// The shader must support misaligned offsets when indexing into the buffer
|
||||
GGML_ASSERT(allow_misalign || misalign_bytes == 0);
|
||||
offset &= ~misalign_bytes;
|
||||
size += misalign_bytes;
|
||||
|
||||
return vk_subbuffer{buffer, offset, size};
|
||||
}
|
||||
|
||||
static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
|
||||
for (auto& cmd_buffer : pool.cmd_buffers) {
|
||||
if (!cmd_buffer.in_use) {
|
||||
cmd_buffer.use_counter++;
|
||||
cmd_buffer.in_use = true;
|
||||
return &cmd_buffer;
|
||||
}
|
||||
}
|
||||
return ggml_vk_create_cmd_buffer(device, pool);
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p);
|
||||
if (one_time) {
|
||||
s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
|
||||
} else {
|
||||
s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} });
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
void ggml_vk_ctx_end(vk_context& ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
|
||||
if (ctx->s == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->s->buffer->buf.end();
|
||||
ctx->s = nullptr;
|
||||
}
|
||||
|
||||
void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
|
||||
if (subctx->s != nullptr) {
|
||||
ggml_vk_ctx_end(subctx);
|
||||
}
|
||||
|
||||
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
|
||||
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
|
||||
}
|
||||
|
||||
vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
|
||||
vk_context result;
|
||||
if (!ctx->compute_ctx.expired()) {
|
||||
result = ctx->compute_ctx.lock();
|
||||
} else {
|
||||
result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
|
||||
ctx->compute_ctx = result;
|
||||
ggml_vk_ctx_begin(ctx->device, result);
|
||||
}
|
||||
|
||||
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
|
||||
result->s->wait_semaphores.push_back(ctx->transfer_semaphore);
|
||||
ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) {
|
||||
if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vk_context cpy_ctx = ctx->transfer_ctx.lock();
|
||||
ggml_vk_ctx_end(cpy_ctx);
|
||||
|
||||
for (auto& cpy : cpy_ctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ctx->transfer_semaphore.value++;
|
||||
cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore);
|
||||
|
||||
ggml_vk_submit(cpy_ctx, {});
|
||||
ctx->transfer_ctx.reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t ggml_vk_align_size(size_t width, size_t align) {
|
||||
VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
|
||||
return CEIL_DIV(width, align) * align;
|
||||
}
|
||||
|
||||
void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys) {
|
||||
if (memcpys == nullptr) {
|
||||
memcpy(dst, src, size);
|
||||
} else {
|
||||
memcpys->emplace_back(dst, src, size);
|
||||
}
|
||||
}
|
||||
|
||||
void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets) {
|
||||
if (memsets == nullptr) {
|
||||
memset(dst, val, size);
|
||||
} else {
|
||||
memsets->emplace_back(dst, val, size);
|
||||
}
|
||||
}
|
||||
|
||||
872
ggml/src/ggml-vulkan/ggml-vulkan-push-constants.h
Normal file
872
ggml/src/ggml-vulkan/ggml-vulkan-push-constants.h
Normal file
@ -0,0 +1,872 @@
|
||||
#pragma once
|
||||
#include "ggml-vulkan-types.h"
|
||||
|
||||
uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t);
|
||||
|
||||
struct vk_mat_mat_push_constants {
|
||||
uint32_t M; uint32_t N; uint32_t K;
|
||||
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
||||
uint32_t base_work_group_z; uint32_t num_batches;
|
||||
uint32_t k_split;
|
||||
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
||||
uint32_t padded_N;
|
||||
};
|
||||
|
||||
struct vk_mat_vec_push_constants {
|
||||
uint32_t ncols;
|
||||
uint32_t stride_a;
|
||||
uint32_t stride_b;
|
||||
uint32_t stride_d;
|
||||
uint32_t batch_stride_a;
|
||||
uint32_t batch_stride_b;
|
||||
uint32_t batch_stride_d;
|
||||
uint32_t fusion_flags;
|
||||
uint32_t base_work_group_y;
|
||||
uint32_t ne02;
|
||||
uint32_t ne12;
|
||||
uint32_t broadcast2;
|
||||
uint32_t broadcast3;
|
||||
};
|
||||
|
||||
struct vk_mat_vec_p021_push_constants {
|
||||
uint32_t ncols_x;
|
||||
uint32_t nrows_x;
|
||||
uint32_t nchannels_x;
|
||||
uint32_t nchannels_y;
|
||||
uint32_t b_offset;
|
||||
uint32_t d_offset;
|
||||
uint32_t fusion_flags;
|
||||
};
|
||||
|
||||
struct vk_mat_vec_nc_push_constants {
|
||||
uint32_t ncols_x;
|
||||
uint32_t nrows_x;
|
||||
uint32_t row_stride_x;
|
||||
uint32_t channel_stride_x;
|
||||
uint32_t channel_stride_y;
|
||||
uint32_t channel_x_divisor;
|
||||
uint32_t ne12;
|
||||
uint32_t b_offset;
|
||||
uint32_t d_offset;
|
||||
uint32_t nb03;
|
||||
uint32_t nb13;
|
||||
uint32_t nb23;
|
||||
uint32_t fusion_flags;
|
||||
};
|
||||
|
||||
struct vk_mat_mat_id_push_constants {
|
||||
uint32_t M; uint32_t N; uint32_t K;
|
||||
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
||||
uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
|
||||
uint32_t padded_N;
|
||||
};
|
||||
|
||||
struct vk_mat_vec_id_push_constants {
|
||||
uint32_t ncols;
|
||||
uint32_t stride_a;
|
||||
uint32_t stride_b;
|
||||
uint32_t stride_d;
|
||||
uint32_t batch_stride_a;
|
||||
uint32_t batch_stride_b;
|
||||
uint32_t batch_stride_d;
|
||||
uint32_t fusion_flags;
|
||||
uint32_t nei0;
|
||||
uint32_t ne11;
|
||||
uint32_t expert_i1;
|
||||
uint32_t nbi1;
|
||||
};
|
||||
|
||||
struct vk_flash_attn_push_constants {
|
||||
uint32_t N;
|
||||
uint32_t KV;
|
||||
|
||||
uint32_t ne1;
|
||||
uint32_t ne2;
|
||||
uint32_t ne3;
|
||||
|
||||
uint32_t neq2;
|
||||
uint32_t neq3;
|
||||
uint32_t nek2;
|
||||
uint32_t nek3;
|
||||
uint32_t nev2;
|
||||
uint32_t nev3;
|
||||
uint32_t nem1;
|
||||
uint32_t nem2;
|
||||
uint32_t nem3;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t nb21;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
|
||||
float scale;
|
||||
float max_bias;
|
||||
float logit_softcap;
|
||||
|
||||
uint32_t mask_n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
|
||||
uint32_t gqa_ratio;
|
||||
uint32_t split_kv;
|
||||
uint32_t k_num;
|
||||
};
|
||||
|
||||
static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128");
|
||||
|
||||
struct vk_op_push_constants {
|
||||
uint32_t KX;
|
||||
uint32_t KY;
|
||||
float param1;
|
||||
float param2;
|
||||
float param3;
|
||||
float param4;
|
||||
};
|
||||
|
||||
struct vk_op_fwht_push_constants {
|
||||
uint32_t n_rows;
|
||||
uint32_t src_offset;
|
||||
uint32_t dst_offset;
|
||||
float scale;
|
||||
};
|
||||
|
||||
struct vk_op_count_experts_push_constants {
|
||||
uint32_t ne00;
|
||||
uint32_t ne01;
|
||||
uint32_t nb00;
|
||||
uint32_t nb01;
|
||||
uint32_t a_offset;
|
||||
};
|
||||
|
||||
struct vk_op_glu_push_constants {
|
||||
uint32_t N;
|
||||
uint32_t ne00;
|
||||
uint32_t ne20;
|
||||
uint32_t mode; // 0: default, 1: swapped, 2: split
|
||||
float alpha; // for swiglu_oai
|
||||
float limit;
|
||||
uint32_t nb00;
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t nb10;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t nb20;
|
||||
uint32_t nb21;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
uint32_t ne21;
|
||||
uint32_t ne22;
|
||||
uint32_t misalign_offsets;
|
||||
uint32_t ne2_012mp; uint32_t ne2_012L;
|
||||
uint32_t ne2_01mp; uint32_t ne2_01L;
|
||||
uint32_t ne2_0mp; uint32_t ne2_0L;
|
||||
};
|
||||
|
||||
static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128");
|
||||
|
||||
struct vk_op_unary_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t misalign_offsets;
|
||||
float param1; float param2; float param3; float param4;
|
||||
uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls;
|
||||
uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls;
|
||||
};
|
||||
|
||||
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
|
||||
|
||||
static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
|
||||
GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
|
||||
ne = ne != 0 ? ne : ggml_nelements(dst);
|
||||
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
|
||||
|
||||
vk_op_unary_push_constants p{};
|
||||
p.ne = (uint32_t)ne;
|
||||
|
||||
size_t src0_tsize = ggml_type_size(src0->type);
|
||||
p.ne00 = (uint32_t)src0->ne[0];
|
||||
p.ne01 = (uint32_t)src0->ne[1];
|
||||
p.ne02 = (uint32_t)src0->ne[2];
|
||||
p.ne03 = (uint32_t)src0->ne[3];
|
||||
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
|
||||
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
|
||||
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
|
||||
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
|
||||
|
||||
size_t dst_tsize = ggml_type_size(dst->type);
|
||||
p.ne10 = (uint32_t)dst->ne[0];
|
||||
p.ne11 = (uint32_t)dst->ne[1];
|
||||
p.ne12 = (uint32_t)dst->ne[2];
|
||||
p.ne13 = (uint32_t)dst->ne[3];
|
||||
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
|
||||
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
|
||||
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
|
||||
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
|
||||
|
||||
return p; // offsets are initialized later in ggml_vk_op
|
||||
}
|
||||
|
||||
struct vk_op_pad_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t misalign_offsets;
|
||||
uint32_t circular;
|
||||
|
||||
uint32_t lp0; uint32_t rp0;
|
||||
uint32_t lp1; uint32_t rp1;
|
||||
uint32_t lp2; uint32_t rp2;
|
||||
uint32_t lp3; uint32_t rp3;
|
||||
};
|
||||
|
||||
static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst) {
|
||||
int64_t ne = ggml_nelements(dst);
|
||||
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
|
||||
|
||||
vk_op_pad_push_constants p{};
|
||||
p.ne = (uint32_t)ne;
|
||||
|
||||
size_t src0_tsize = ggml_type_size(src0->type);
|
||||
p.ne00 = (uint32_t)src0->ne[0];
|
||||
p.ne01 = (uint32_t)src0->ne[1];
|
||||
p.ne02 = (uint32_t)src0->ne[2];
|
||||
p.ne03 = (uint32_t)src0->ne[3];
|
||||
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
|
||||
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
|
||||
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
|
||||
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
|
||||
|
||||
size_t dst_tsize = ggml_type_size(dst->type);
|
||||
p.ne10 = (uint32_t)dst->ne[0];
|
||||
p.ne11 = (uint32_t)dst->ne[1];
|
||||
p.ne12 = (uint32_t)dst->ne[2];
|
||||
p.ne13 = (uint32_t)dst->ne[3];
|
||||
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
|
||||
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
|
||||
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
|
||||
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
|
||||
|
||||
p.lp0 = dst->op_params[0];
|
||||
p.rp0 = dst->op_params[1];
|
||||
p.lp1 = dst->op_params[2];
|
||||
p.rp1 = dst->op_params[3];
|
||||
p.lp2 = dst->op_params[4];
|
||||
p.rp2 = dst->op_params[5];
|
||||
p.lp3 = dst->op_params[6];
|
||||
p.rp3 = dst->op_params[7];
|
||||
p.circular = dst->op_params[8];
|
||||
|
||||
return p; // fastdiv values and offsets are initialized later in ggml_vk_op
|
||||
}
|
||||
|
||||
static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
|
||||
{
|
||||
// compute L = ceil(log2(d));
|
||||
L = 0;
|
||||
while (L < 32 && (uint32_t{1} << L) < d) {
|
||||
L++;
|
||||
}
|
||||
|
||||
mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
|
||||
}
|
||||
|
||||
static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) {
|
||||
return L0 | (L1 << 8) | (L2 << 16);
|
||||
}
|
||||
|
||||
template <typename T> void init_pushconst_fastdiv(T &p) {
|
||||
GGML_UNUSED(p);
|
||||
static_assert(!std::is_const<T>::value, "unexpected type");
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
|
||||
// Compute magic values to divide by these six numbers.
|
||||
uint32_t ne0_012L;
|
||||
uint32_t ne0_01L;
|
||||
uint32_t ne0_0L;
|
||||
uint32_t ne1_012L;
|
||||
uint32_t ne1_01L;
|
||||
uint32_t ne1_0L;
|
||||
|
||||
init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L);
|
||||
init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L);
|
||||
init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L);
|
||||
init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L);
|
||||
init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L);
|
||||
init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L);
|
||||
|
||||
p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L);
|
||||
p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_fastdiv(vk_op_glu_push_constants &p) {
|
||||
// GLU linearizes over dst, then uses dst coordinates for src0/src1.
|
||||
init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L);
|
||||
init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L);
|
||||
init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L);
|
||||
}
|
||||
|
||||
struct vk_op_binary_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
|
||||
uint32_t misalign_offsets;
|
||||
float param1; float param2; int32_t param3;
|
||||
};
|
||||
|
||||
struct vk_op_multi_add_push_constants {
|
||||
// shape for dst
|
||||
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23;
|
||||
|
||||
// strides for srcs+dst
|
||||
uint32_t nb[MAX_PARAMETER_COUNT][4];
|
||||
|
||||
uint32_t rms_partials;
|
||||
};
|
||||
|
||||
static_assert(MAX_PARAMETER_COUNT == 12);
|
||||
|
||||
static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
|
||||
|
||||
struct vk_op_topk_moe_push_constants {
|
||||
uint32_t n_rows;
|
||||
uint32_t n_experts_push;
|
||||
uint32_t n_expert_used;
|
||||
float clamp_min;
|
||||
float clamp_max;
|
||||
uint32_t gating_func;
|
||||
uint32_t has_bias;
|
||||
uint32_t with_norm;
|
||||
float output_scale;
|
||||
float output_bias;
|
||||
};
|
||||
|
||||
struct vk_op_add_id_push_constants {
|
||||
uint32_t ne0;
|
||||
uint32_t ne1;
|
||||
uint32_t s01;
|
||||
uint32_t s02;
|
||||
uint32_t s11;
|
||||
uint32_t s21;
|
||||
};
|
||||
|
||||
struct vk_op_diag_mask_push_constants {
|
||||
uint32_t ncols;
|
||||
uint32_t rows_per_channel;
|
||||
int32_t n_past;
|
||||
};
|
||||
|
||||
struct vk_op_rope_push_constants {
|
||||
uint32_t rope_mode;
|
||||
uint32_t nrows;
|
||||
uint32_t n_dims;
|
||||
float freq_scale;
|
||||
float freq_base;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float corr_dims[2];
|
||||
float theta_scale;
|
||||
uint32_t has_ff;
|
||||
int32_t sections[4];
|
||||
uint32_t is_imrope;
|
||||
uint32_t is_back;
|
||||
uint32_t set_rows_stride;
|
||||
uint32_t ne00;
|
||||
uint32_t ne01;
|
||||
uint32_t ne02;
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t a_offset;
|
||||
uint32_t d_offset;
|
||||
};
|
||||
|
||||
static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128");
|
||||
|
||||
struct vk_op_rms_norm_mul_rope_push_constants {
|
||||
vk_op_binary_push_constants bin;
|
||||
vk_op_rope_push_constants rope;
|
||||
};
|
||||
|
||||
struct vk_op_soft_max_push_constants {
|
||||
uint32_t KX;
|
||||
uint32_t KY;
|
||||
uint32_t ne00;
|
||||
uint32_t ne01;
|
||||
uint32_t ne02;
|
||||
uint32_t ne12;
|
||||
uint32_t ne13;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint32_t n_head_log2;
|
||||
uint32_t nrows_x;
|
||||
uint32_t has_sinks;
|
||||
};
|
||||
|
||||
struct vk_op_argsort_push_constants {
|
||||
uint32_t ncols;
|
||||
uint32_t ncols_padded;
|
||||
uint32_t ncols_padded_log2;
|
||||
uint32_t nrows;
|
||||
uint32_t order;
|
||||
uint32_t outer_start;
|
||||
uint32_t outer_end;
|
||||
uint32_t inner_start;
|
||||
uint32_t inner_end;
|
||||
};
|
||||
|
||||
struct vk_op_topk_push_constants {
|
||||
uint32_t orig_ncols;
|
||||
uint32_t ncols_input;
|
||||
uint32_t ncols_output;
|
||||
uint32_t k;
|
||||
uint32_t nrows;
|
||||
uint32_t first_pass;
|
||||
uint32_t last_pass;
|
||||
};
|
||||
|
||||
struct vk_op_im2col_push_constants {
|
||||
uint64_t dst_addr;
|
||||
uint32_t batch_offset; uint32_t offset_delta;
|
||||
uint32_t IC;
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
uint32_t KW; uint32_t KH;
|
||||
uint32_t OH_batch;
|
||||
uint32_t CHW;
|
||||
int32_t s0; int32_t s1;
|
||||
int32_t p0; int32_t p1;
|
||||
int32_t d0; int32_t d1;
|
||||
uint32_t batch_IC;
|
||||
};
|
||||
|
||||
struct vk_op_im2col_3d_push_constants {
|
||||
uint64_t dst_addr;
|
||||
uint32_t nb10;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t s0;
|
||||
uint32_t s1;
|
||||
uint32_t s2;
|
||||
uint32_t p0;
|
||||
uint32_t p1;
|
||||
uint32_t p2;
|
||||
uint32_t d0;
|
||||
uint32_t d1;
|
||||
uint32_t d2;
|
||||
uint32_t IW;
|
||||
uint32_t IH;
|
||||
uint32_t ID;
|
||||
uint32_t IC;
|
||||
uint32_t KW;
|
||||
uint32_t OH;
|
||||
uint32_t KD_KH_KW;
|
||||
uint32_t KH_KW;
|
||||
uint32_t IC_KD_KH_KW;
|
||||
uint32_t N_OD_OH;
|
||||
uint32_t OD_OH;
|
||||
uint32_t OD_OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OW_IC_KD_KH_KW;
|
||||
uint32_t misalign_offsets;
|
||||
};
|
||||
|
||||
struct vk_op_timestep_embedding_push_constants {
|
||||
uint32_t nb1;
|
||||
uint32_t dim;
|
||||
uint32_t max_period;
|
||||
};
|
||||
|
||||
struct vk_op_col2im_1d_push_constants {
|
||||
uint32_t T_out;
|
||||
uint32_t OC;
|
||||
uint32_t K_OC;
|
||||
uint32_t T_in;
|
||||
uint32_t K;
|
||||
int32_t stride;
|
||||
int32_t p0;
|
||||
};
|
||||
|
||||
struct vk_op_conv_transpose_1d_push_constants {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
uint32_t K;
|
||||
uint32_t L;
|
||||
uint32_t KL;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb11;
|
||||
uint32_t nb1;
|
||||
|
||||
int32_t s0;
|
||||
};
|
||||
|
||||
struct vk_op_snake_push_constants {
|
||||
uint32_t ne0;
|
||||
uint32_t ne1;
|
||||
};
|
||||
|
||||
struct vk_op_pool2d_push_constants {
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
uint32_t OC;
|
||||
uint32_t pelements;
|
||||
uint32_t op;
|
||||
int32_t k0; int32_t k1;
|
||||
int32_t s0; int32_t s1;
|
||||
int32_t p0; int32_t p1;
|
||||
};
|
||||
|
||||
struct vk_op_rwkv_wkv6_push_constants {
|
||||
uint32_t B;
|
||||
uint32_t T;
|
||||
uint32_t C;
|
||||
uint32_t H;
|
||||
};
|
||||
|
||||
struct vk_op_rwkv_wkv7_push_constants {
|
||||
uint32_t B;
|
||||
uint32_t T;
|
||||
uint32_t C;
|
||||
uint32_t H;
|
||||
};
|
||||
|
||||
struct vk_op_gated_delta_net_push_constants {
|
||||
uint32_t H;
|
||||
uint32_t n_tokens;
|
||||
uint32_t n_seqs;
|
||||
uint32_t s_off;
|
||||
uint32_t sq1, sq2, sq3;
|
||||
uint32_t sv1, sv2, sv3;
|
||||
uint32_t sb1, sb2, sb3;
|
||||
uint32_t neq1, rq3;
|
||||
float scale;
|
||||
uint32_t K;
|
||||
};
|
||||
|
||||
struct vk_op_ssm_scan_push_constants {
|
||||
uint32_t nb02, nb03, nb12, nb13;
|
||||
uint32_t nb21, nb22, nb31;
|
||||
uint32_t nb42, nb43, nb52, nb53;
|
||||
uint32_t s_off;
|
||||
uint32_t n_head, d_head, n_group, n_tok;
|
||||
};
|
||||
|
||||
struct vk_op_ssm_conv_push_constants {
|
||||
uint32_t nb01, nb02;
|
||||
uint32_t nb11;
|
||||
uint32_t dst_nb0, dst_nb1, dst_nb2;
|
||||
uint32_t nc, ncs, nr, n_t, n_s;
|
||||
};
|
||||
|
||||
struct vk_op_conv2d_push_constants {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
uint32_t N;
|
||||
|
||||
uint32_t W;
|
||||
uint32_t H;
|
||||
uint32_t OW;
|
||||
uint32_t OH;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
|
||||
uint32_t nb1;
|
||||
uint32_t nb2;
|
||||
uint32_t nb3;
|
||||
|
||||
// init_fastdiv_values constants for dividing by OW, OW*OH
|
||||
uint32_t OWmp; uint32_t OWL;
|
||||
uint32_t OWOHmp; uint32_t OWOHL;
|
||||
};
|
||||
|
||||
template <> inline void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
|
||||
// Compute magic values to divide by OW, OW*OH
|
||||
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
|
||||
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
|
||||
}
|
||||
|
||||
struct vk_op_conv2d_dw_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t batches;
|
||||
uint32_t channels;
|
||||
uint32_t dst_w;
|
||||
uint32_t dst_h;
|
||||
uint32_t src_w;
|
||||
uint32_t src_h;
|
||||
uint32_t knl_w;
|
||||
uint32_t knl_h;
|
||||
int32_t stride_x;
|
||||
int32_t stride_y;
|
||||
int32_t pad_x;
|
||||
int32_t pad_y;
|
||||
int32_t dilation_x;
|
||||
int32_t dilation_y;
|
||||
};
|
||||
|
||||
struct vk_op_upscale_push_constants {
|
||||
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
|
||||
uint32_t ne00; uint32_t ne01;
|
||||
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
|
||||
float sf0; float sf1; float sf2; float sf3;
|
||||
float pixel_offset;
|
||||
};
|
||||
|
||||
struct vk_op_sum_rows_push_constants
|
||||
{
|
||||
uint32_t n_cols;
|
||||
uint32_t ne01, ne02;
|
||||
uint32_t nb01, nb02, nb03;
|
||||
uint32_t nb11, nb12, nb13;
|
||||
float weight;
|
||||
uint32_t misalign_offsets;
|
||||
uint32_t ne0_12mp, ne0_12L;
|
||||
uint32_t ne0_1mp, ne0_1L;
|
||||
};
|
||||
|
||||
static vk_op_sum_rows_push_constants vk_op_sum_rows_push_constants_init(const ggml_tensor * src, const ggml_tensor * dst, int64_t n_cols) {
|
||||
uint32_t type_size = (uint32_t)ggml_type_size(src->type);
|
||||
vk_op_sum_rows_push_constants p = {};
|
||||
p.n_cols = (uint32_t)n_cols;
|
||||
p.ne01 = (uint32_t)src->ne[1];
|
||||
p.ne02 = (uint32_t)src->ne[2];
|
||||
p.nb01 = (uint32_t)src->nb[1] / type_size;
|
||||
p.nb02 = (uint32_t)src->nb[2] / type_size;
|
||||
p.nb03 = (uint32_t)src->nb[3] / type_size;
|
||||
p.nb11 = (uint32_t)dst->nb[1] / type_size;
|
||||
p.nb12 = (uint32_t)dst->nb[2] / type_size;
|
||||
p.nb13 = (uint32_t)dst->nb[3] / type_size;
|
||||
p.weight = 1.0f;
|
||||
return p;
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) {
|
||||
init_fastdiv_values(p.ne01*p.ne02, p.ne0_12mp, p.ne0_12L);
|
||||
init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L);
|
||||
}
|
||||
|
||||
struct vk_quantize_q8_1_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
struct vk_op_flash_attn_split_k_reduce_push_constants {
|
||||
uint32_t D;
|
||||
uint32_t ne1;
|
||||
uint32_t ne2;
|
||||
uint32_t ne3;
|
||||
uint32_t k_num;
|
||||
uint32_t sinks;
|
||||
};
|
||||
|
||||
struct vk_op_flash_attn_mask_opt_push_constants {
|
||||
uint32_t nem0;
|
||||
uint32_t nem1;
|
||||
uint32_t nem2;
|
||||
uint32_t nbm1;
|
||||
uint32_t nbm2;
|
||||
uint32_t nbm3;
|
||||
uint32_t nbd1;
|
||||
uint32_t nbd2;
|
||||
uint32_t nbd3;
|
||||
};
|
||||
|
||||
template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
GGML_UNUSED(p);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
GGML_UNUSED(dst);
|
||||
static_assert(!std::is_const<T>::value, "unexpected type");
|
||||
GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
|
||||
GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
|
||||
GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
|
||||
GGML_ASSERT(!src3 || get_misalign_bytes(ctx, src3) == 0);
|
||||
GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_p021_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.b_offset = b_offset;
|
||||
p.d_offset = d_offset;
|
||||
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_nc_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.b_offset = b_offset;
|
||||
p.d_offset = d_offset;
|
||||
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
p.dst_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <typename T> size_t push_constant_size(const T &t) {
|
||||
static_assert(std::is_class<T>::value, "T must be a struct/class");
|
||||
GGML_UNUSED(t);
|
||||
return sizeof(T);
|
||||
}
|
||||
|
||||
template <typename T> size_t push_constant_size(const std::vector<T> &t) {
|
||||
GGML_UNUSED(t);
|
||||
return sizeof(T) * t.size();
|
||||
}
|
||||
|
||||
template <typename T, uint32_t N> size_t push_constant_size(const std::array<T, N> &t) {
|
||||
GGML_UNUSED(t);
|
||||
return sizeof(T) * N;
|
||||
}
|
||||
|
||||
template <typename T> const T *push_constant_data(const T &t) {
|
||||
static_assert(std::is_class<T>::value, "T must be a struct/class");
|
||||
return &t;
|
||||
}
|
||||
|
||||
template <typename T> const T *push_constant_data(const std::vector<T> &t) {
|
||||
return t.data();
|
||||
}
|
||||
|
||||
template <typename T, uint32_t N> const T *push_constant_data(const std::array<T, N> &t) {
|
||||
return t.data();
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset;
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT(a_offset < (1u << 8));
|
||||
GGML_ASSERT(b_offset < (1u << 8));
|
||||
GGML_ASSERT(d_offset < (1u << 8));
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
|
||||
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
|
||||
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.a_offset = a_offset;
|
||||
p.d_offset = d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
2704
ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp
Normal file
2704
ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp
Normal file
File diff suppressed because it is too large
Load Diff
1341
ggml/src/ggml-vulkan/ggml-vulkan-types.h
Normal file
1341
ggml/src/ggml-vulkan/ggml-vulkan-types.h
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user