mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
2894 lines
130 KiB
C++
2894 lines
130 KiB
C++
#include "ggml-vulkan-common.h"
|
|
|
|
void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) {
|
|
VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
|
|
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
|
|
|
|
const uint32_t ne = ggml_nelements(tensor);
|
|
std::array<uint32_t, 3> elements;
|
|
|
|
if (ne > 262144) {
|
|
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
|
} else if (ne > 512) {
|
|
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
|
} else {
|
|
elements = { ne, 1, 1 };
|
|
}
|
|
|
|
vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
|
|
pc.nb10 = 1;
|
|
pc.nb11 = (uint32_t)tensor->ne[0];
|
|
pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]);
|
|
pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]);
|
|
init_pushconst_fastdiv(pc);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
|
|
static int ggml_vk_fwht_pipeline_idx(int64_t n) {
|
|
switch (n) {
|
|
case 64: return 0;
|
|
case 128: return 1;
|
|
case 256: return 2;
|
|
case 512: return 3;
|
|
default: return -1;
|
|
}
|
|
}
|
|
|
|
bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst) {
|
|
if (ctx->num_additional_fused_ops != 0) {
|
|
return false;
|
|
}
|
|
|
|
if (ggml_get_op_params_i32(dst, 1) != GGML_HINT_SRC0_IS_HADAMARD) {
|
|
return false;
|
|
}
|
|
|
|
const int idx = ggml_vk_fwht_pipeline_idx(src1->ne[0]);
|
|
if (idx < 0 || ctx->device->pipeline_fwht_f32[idx] == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
return false;
|
|
}
|
|
|
|
if (!ggml_is_contiguous(src1)) {
|
|
return false;
|
|
}
|
|
GGML_ASSERT(ggml_is_contiguous(dst));
|
|
|
|
return true;
|
|
}
|
|
|
|
void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst) {
|
|
const int idx = ggml_vk_fwht_pipeline_idx(src->ne[0]);
|
|
vk_pipeline pipeline = ctx->device->pipeline_fwht_f32[idx];
|
|
|
|
const uint32_t rows_per_workgroup = 4;
|
|
const uint32_t n_rows = (uint32_t)ggml_nrows(src);
|
|
const uint32_t max_workgroups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
|
|
|
|
const uint32_t total_workgroups = CEIL_DIV(n_rows, rows_per_workgroup);
|
|
const uint32_t workgroups_x = std::min(total_workgroups, max_workgroups_x);
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
const vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src, true);
|
|
const vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true);
|
|
|
|
vk_op_fwht_push_constants pc = {
|
|
n_rows,
|
|
0,
|
|
0,
|
|
1.0f / std::sqrt((float)src->ne[0]),
|
|
};
|
|
init_pushconst_tensor_offsets(ctx, pc, src, nullptr, nullptr, nullptr, dst);
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, { workgroups_x, 1, 1 });
|
|
}
|
|
|
|
static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, uint32_t K, uint32_t NPQ) {
|
|
auto n_tiles = [&](vk_conv_shapes s) {
|
|
return CEIL_DIV(K, vk_conv_block_sizes[s].K)
|
|
* CEIL_DIV(NPQ, vk_conv_block_sizes[s].NPQ);
|
|
};
|
|
|
|
// We can't query number of shader cores on Intel, use 32 as a placeholder
|
|
// so small convolutions will still choose a smaller tile.
|
|
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
|
|
|
|
// 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile.
|
|
bool allow_128x128 = true;
|
|
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
|
if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) {
|
|
allow_128x128 = false;
|
|
}
|
|
#endif
|
|
|
|
if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
|
|
return CONV_SHAPE_128x128;
|
|
} else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
|
|
return CONV_SHAPE_32x256;
|
|
} else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
|
|
return CONV_SHAPE_64x128;
|
|
} else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
|
|
// cm1 fallback for large K when 128x128 isn't available
|
|
return CONV_SHAPE_64x128;
|
|
} else {
|
|
return CONV_SHAPE_64x32;
|
|
}
|
|
}
|
|
|
|
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) {
|
|
switch (op) {
|
|
case GGML_OP_GET_ROWS:
|
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
if (src0->type == GGML_TYPE_I32) {
|
|
// i32 src only supports i32 result
|
|
GGML_ASSERT(dst->type == GGML_TYPE_I32);
|
|
return ctx->device->pipeline_get_rows[src0->type];
|
|
}
|
|
if (dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_get_rows[src0->type];
|
|
}
|
|
if (dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_get_rows_f32[src0->type];
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ACC:
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_acc_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SET:
|
|
if (src0->type == src1->type && src0->type == dst->type &&
|
|
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32)) {
|
|
return ctx->device->pipeline_set_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ADD:
|
|
case GGML_OP_SUB:
|
|
case GGML_OP_MUL:
|
|
case GGML_OP_DIV:
|
|
if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
|
|
(src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) ||
|
|
(dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16)) {
|
|
return nullptr;
|
|
}
|
|
switch (op) {
|
|
case GGML_OP_ADD:
|
|
{
|
|
if (ctx->num_additional_fused_ops > 0) {
|
|
if (ctx->do_add_rms_partials) {
|
|
return ctx->device->pipeline_multi_add_rms[ctx->num_additional_fused_ops];
|
|
} else {
|
|
return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops];
|
|
}
|
|
}
|
|
if (ctx->do_add_rms_partials) {
|
|
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_rms_norepeat : ctx->device->pipeline_add_rms;
|
|
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
|
} else {
|
|
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add;
|
|
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
|
}
|
|
}
|
|
case GGML_OP_SUB:
|
|
{
|
|
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_sub_norepeat : ctx->device->pipeline_sub;
|
|
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
|
}
|
|
case GGML_OP_MUL:
|
|
{
|
|
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_norepeat : ctx->device->pipeline_mul;
|
|
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
|
}
|
|
case GGML_OP_DIV:
|
|
{
|
|
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_norepeat : ctx->device->pipeline_div;
|
|
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ADD_ID:
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && src2->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_add_id_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CONCAT: {
|
|
if (src0->type != src1->type || src0->type != dst->type) {
|
|
return nullptr;
|
|
}
|
|
if (ggml_blck_size(src0->type) != 1) {
|
|
return nullptr;
|
|
}
|
|
const size_t type_size = ggml_type_size(src0->type);
|
|
switch (type_size) {
|
|
case 1:
|
|
return ctx->device->pipeline_concat_i8;
|
|
case 2:
|
|
return ctx->device->pipeline_concat_i16;
|
|
case 4:
|
|
return ctx->device->pipeline_concat_i32;
|
|
case 8:
|
|
return ctx->device->pipeline_concat_i64;
|
|
default:
|
|
return nullptr;
|
|
}
|
|
}
|
|
case GGML_OP_UPSCALE:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS));
|
|
switch (mode) {
|
|
case GGML_SCALE_MODE_NEAREST:
|
|
return ctx->device->pipeline_upscale_nearest_f32;
|
|
case GGML_SCALE_MODE_BILINEAR:
|
|
return ctx->device->pipeline_upscale_bilinear_f32;
|
|
case GGML_SCALE_MODE_BICUBIC:
|
|
return ctx->device->pipeline_upscale_bicubic_f32;
|
|
case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS:
|
|
return ctx->device->pipeline_upscale_bilinear_antialias_f32;
|
|
default:
|
|
return nullptr;
|
|
}
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SCALE:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_scale_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SQR:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_sqr_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SQRT:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_sqrt_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SIN:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_sin_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_COS:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_cos_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_LOG:
|
|
if (src0->type == dst->type &&
|
|
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
|
return ctx->device->pipeline_log[dst->type == GGML_TYPE_F16];
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_TRI:
|
|
if (src0->type == dst->type &&
|
|
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
|
return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_DIAG:
|
|
if (src0->type == dst->type &&
|
|
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
|
return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CLAMP:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_clamp_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_PAD:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_pad_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ROLL:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_roll_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_REPEAT:
|
|
if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
|
|
return ctx->device->pipeline_repeat_i32;
|
|
}
|
|
if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) {
|
|
return ctx->device->pipeline_repeat_i16;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_REPEAT_BACK:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_repeat_back_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CPY:
|
|
case GGML_OP_CONT:
|
|
case GGML_OP_DUP:
|
|
return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
|
|
case GGML_OP_SET_ROWS:
|
|
if (src1->type == GGML_TYPE_I64) {
|
|
return ctx->device->pipeline_set_rows_i64[dst->type];
|
|
} else {
|
|
return ctx->device->pipeline_set_rows_i32[dst->type];
|
|
}
|
|
case GGML_OP_SILU_BACK:
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_silu_back_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_NORM:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_norm_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_GROUP_NORM:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_group_norm_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_RMS_NORM:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
if (ctx->do_add_rms_partials) {
|
|
return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_partials_f32 : ctx->device->pipeline_rms_norm_partials_f32;
|
|
} else {
|
|
return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32;
|
|
}
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_RMS_NORM_BACK:
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rms_norm_back_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_L2_NORM:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_l2_norm_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_UNARY:
|
|
if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
|
|
(dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
|
|
(src0->type != dst->type)) {
|
|
return nullptr;
|
|
}
|
|
|
|
switch (ggml_get_unary_op(dst)) {
|
|
case GGML_UNARY_OP_EXP:
|
|
return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_EXPM1:
|
|
return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_ELU:
|
|
return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_SILU:
|
|
return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_GELU:
|
|
return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_GELU_ERF:
|
|
return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_RELU:
|
|
return ctx->device->pipeline_relu[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_XIELU:
|
|
return ctx->device->pipeline_xielu[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_NEG:
|
|
return ctx->device->pipeline_neg[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_TANH:
|
|
return ctx->device->pipeline_tanh[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_SIGMOID:
|
|
return ctx->device->pipeline_sigmoid[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
return ctx->device->pipeline_hardsigmoid[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_HARDSWISH:
|
|
return ctx->device->pipeline_hardswish[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_ABS:
|
|
return ctx->device->pipeline_abs[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_SOFTPLUS:
|
|
return ctx->device->pipeline_softplus[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_STEP:
|
|
return ctx->device->pipeline_step[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_ROUND:
|
|
return ctx->device->pipeline_round[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_CEIL:
|
|
return ctx->device->pipeline_ceil[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_FLOOR:
|
|
return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_TRUNC:
|
|
return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16];
|
|
case GGML_UNARY_OP_SGN:
|
|
return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16];
|
|
default:
|
|
break;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_GLU:
|
|
if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
|
|
(dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
|
|
(src0->type != dst->type)) {
|
|
return nullptr;
|
|
}
|
|
|
|
switch (ggml_get_glu_op(dst)) {
|
|
case GGML_GLU_OP_GEGLU:
|
|
return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16];
|
|
case GGML_GLU_OP_REGLU:
|
|
return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16];
|
|
case GGML_GLU_OP_SWIGLU:
|
|
return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16];
|
|
case GGML_GLU_OP_SWIGLU_OAI:
|
|
return ctx->device->pipeline_swiglu_oai[dst->type == GGML_TYPE_F16];
|
|
case GGML_GLU_OP_GEGLU_ERF:
|
|
return ctx->device->pipeline_geglu_erf[dst->type == GGML_TYPE_F16];
|
|
case GGML_GLU_OP_GEGLU_QUICK:
|
|
return ctx->device->pipeline_geglu_quick[dst->type == GGML_TYPE_F16];
|
|
default:
|
|
break;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_DIAG_MASK_INF:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_diag_mask_inf_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SOFT_MAX:
|
|
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
|
|
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32);
|
|
|
|
if (ctx->num_additional_fused_ops) {
|
|
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
|
|
GGML_ASSERT(idx < num_topk_moe_pipelines);
|
|
// use n_experts from push constant if it's not equal to the power of two spec constant
|
|
bool use_push = dst->ne[0] != (1u << idx);
|
|
return ctx->device->pipeline_topk_moe[idx][use_push];
|
|
}
|
|
|
|
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
|
|
return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32;
|
|
}
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
|
return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SOFT_MAX_BACK:
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_soft_max_back_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ROPE:
|
|
case GGML_OP_ROPE_BACK:
|
|
{
|
|
const ggml_tensor *rope = ctx->num_additional_fused_ops == 2 ? dst->src[0]->src[0] : dst;
|
|
const int mode = ((const int32_t *) rope->op_params)[2];
|
|
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
|
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
|
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
|
|
|
if (is_neox) {
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rope_neox_f32;
|
|
}
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_neox_f32_f16;
|
|
}
|
|
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_neox_f16;
|
|
}
|
|
} else if (is_mrope && !is_vision) {
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rope_multi_f32;
|
|
}
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_multi_f32_f16;
|
|
}
|
|
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_multi_f16;
|
|
}
|
|
} else if (is_vision) {
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rope_vision_f32;
|
|
}
|
|
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_vision_f16;
|
|
}
|
|
} else {
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rope_norm_f32;
|
|
}
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_norm_f32_f16;
|
|
}
|
|
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_rope_norm_f16;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
case GGML_OP_SUM:
|
|
case GGML_OP_SUM_ROWS:
|
|
case GGML_OP_MEAN:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_sum_rows_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CUMSUM:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
if (src0->ne[0] <= 512) {
|
|
return ctx->device->pipeline_cumsum_small_f32;
|
|
} else {
|
|
return ctx->device->pipeline_cumsum_f32;
|
|
}
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SOLVE_TRI:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
|
|
vk_solve_tri_pipeline_state solve_tri_pipeline_state(src0->ne[0], src1->ne[0]);
|
|
|
|
vk_pipeline pipeline = nullptr;
|
|
|
|
{
|
|
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
|
auto it = ctx->device->pipeline_solve_tri_f32.find(solve_tri_pipeline_state);
|
|
if (it != ctx->device->pipeline_solve_tri_f32.end()) {
|
|
pipeline = it->second;
|
|
} else {
|
|
ctx->device->pipeline_solve_tri_f32[solve_tri_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
|
|
}
|
|
}
|
|
|
|
return pipeline;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ARGMAX:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
|
|
return ctx->device->pipeline_argmax_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_COUNT_EQUAL:
|
|
if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I64) {
|
|
return ctx->device->pipeline_count_equal_i32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_IM2COL:
|
|
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_im2col_f32;
|
|
}
|
|
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_im2col_f32_f16;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_IM2COL_3D:
|
|
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_im2col_3d_f32;
|
|
}
|
|
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_im2col_3d_f32_f16;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_timestep_embedding_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_conv_transpose_1d_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_COL2IM_1D:
|
|
switch (src0->type) {
|
|
case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32;
|
|
case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16;
|
|
case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16;
|
|
default: return nullptr;
|
|
}
|
|
case GGML_OP_POOL_2D:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_pool2d_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_RWKV_WKV6:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rwkv_wkv6_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_RWKV_WKV7:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_rwkv_wkv7_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_GATED_DELTA_NET:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
const uint32_t S_v = dst->src[2]->ne[0];
|
|
const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
|
|
uint32_t si;
|
|
switch (S_v) {
|
|
case 16: si = 0; break;
|
|
case 32: si = 1; break;
|
|
case 64: si = 2; break;
|
|
case 128: si = 3; break;
|
|
default: return nullptr;
|
|
}
|
|
return ctx->device->pipeline_gated_delta_net[si][kda];
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SSM_SCAN:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
const uint32_t d_state = src0->ne[0];
|
|
if (d_state == 128) {
|
|
return ctx->device->pipeline_ssm_scan_f32_d128;
|
|
} else if (d_state == 256) {
|
|
return ctx->device->pipeline_ssm_scan_f32_d256;
|
|
}
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_SSM_CONV:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
switch (ctx->num_additional_fused_ops) {
|
|
case 0: return ctx->device->pipeline_ssm_conv_f32;
|
|
case 1: return ctx->device->pipeline_ssm_conv_silu_f32;
|
|
case 2: return ctx->device->pipeline_ssm_conv_bias_silu_f32;
|
|
default: return nullptr;
|
|
}
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_OPT_STEP_ADAMW:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_opt_step_adamw_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_OPT_STEP_SGD:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_opt_step_sgd_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_LEAKY_RELU:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_leaky_relu_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CONV_2D:
|
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
|
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
uint32_t K = dst->ne[2]; // Cout
|
|
uint32_t NPQ = dst->ne[3] * dst->ne[1] * dst->ne[0]; // N * OH * OW
|
|
vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, K, NPQ);
|
|
|
|
bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D;
|
|
uint32_t KW = (uint32_t)src0->ne[0];
|
|
uint32_t KH = (uint32_t)src0->ne[1];
|
|
uint32_t s0 = (uint32_t)(ggml_get_op_params_i32(dst, 0));
|
|
uint32_t s1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 1) : s0;
|
|
uint32_t p0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 2) : 0;
|
|
uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
|
|
uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
|
|
uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
|
|
|
|
// tile-aligned shapes let the shader skip bounds checks
|
|
const uint32_t Cin = (uint32_t)src1->ne[2];
|
|
const uint32_t CRS = Cin * KW * KH;
|
|
const uint32_t BS_K = vk_conv_block_sizes[shape].K;
|
|
const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS;
|
|
const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ;
|
|
const uint32_t aligned = ((K % BS_K == 0) &&
|
|
(CRS % BS_CRS == 0) &&
|
|
(NPQ % BS_NPQ == 0)) ? 1u : 0u;
|
|
|
|
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned);
|
|
|
|
std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
|
|
if (op == GGML_OP_CONV_2D) {
|
|
if (src0->type == GGML_TYPE_F32) {
|
|
pipelines = &ctx->device->pipeline_conv2d_f32[shape];
|
|
} else if (src0->type == GGML_TYPE_F16) {
|
|
pipelines = &ctx->device->pipeline_conv2d_f16_f32[shape];
|
|
}
|
|
} else if (op == GGML_OP_CONV_TRANSPOSE_2D) {
|
|
if (src0->type == GGML_TYPE_F32) {
|
|
pipelines = &ctx->device->pipeline_conv_transpose_2d_f32[shape];
|
|
} else if (src0->type == GGML_TYPE_F16) {
|
|
pipelines = &ctx->device->pipeline_conv_transpose_2d_f16_f32[shape];
|
|
}
|
|
}
|
|
|
|
vk_pipeline pipeline = nullptr;
|
|
|
|
{
|
|
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
|
auto it = pipelines->find(conv2d_pipeline_state);
|
|
if (it != pipelines->end()) {
|
|
pipeline = it->second;
|
|
} else {
|
|
(*pipelines)[conv2d_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
|
|
}
|
|
}
|
|
|
|
return pipeline;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_CONV_2D_DW:
|
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
if (ggml_is_contiguous(src1)) {
|
|
return ctx->device->pipeline_conv2d_dw_whcn_f32;
|
|
} else if (ggml_is_contiguous_channels(src1)) {
|
|
return ctx->device->pipeline_conv2d_dw_cwhn_f32;
|
|
}
|
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
|
if (ggml_is_contiguous(src1)) {
|
|
return ctx->device->pipeline_conv2d_dw_whcn_f16_f32;
|
|
} else if (ggml_is_contiguous_channels(src1)) {
|
|
return ctx->device->pipeline_conv2d_dw_cwhn_f16_f32;
|
|
}
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ADD1:
|
|
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_add1_f16_f16;
|
|
}
|
|
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_add1_f16_f32;
|
|
}
|
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_add1_f32_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_ARANGE:
|
|
if (dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_arange_f32;
|
|
}
|
|
return nullptr;
|
|
case GGML_OP_FILL:
|
|
if (dst->type == GGML_TYPE_F32) {
|
|
return ctx->device->pipeline_fill_f32;
|
|
}
|
|
if (dst->type == GGML_TYPE_F16) {
|
|
return ctx->device->pipeline_fill_f16;
|
|
}
|
|
return nullptr;
|
|
default:
|
|
return nullptr;
|
|
}
|
|
|
|
GGML_UNUSED(src2);
|
|
}
|
|
|
|
template<typename PC>
|
|
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) {
|
|
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
|
if (src1 != nullptr) {
|
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
|
}
|
|
if (src2 != nullptr) {
|
|
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
|
}
|
|
if (src3 != nullptr) {
|
|
std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3];
|
|
}
|
|
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
|
|
std::cerr << "), " << ggml_op_name(op) << ")");
|
|
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
|
GGML_ASSERT(dst->buffer != nullptr);
|
|
const uint64_t ne00 = src0->ne[0];
|
|
const uint64_t ne01 = src0->ne[1];
|
|
const uint64_t ne02 = src0->ne[2];
|
|
const uint64_t ne03 = src0->ne[3];
|
|
|
|
const bool use_src1 = src1 != nullptr;
|
|
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
|
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
|
const uint64_t ne12 = use_src1 ? src1->ne[2] : 0;
|
|
const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
|
|
|
|
const bool use_src2 = src2 != nullptr;
|
|
const bool use_src3 = src3 != nullptr;
|
|
|
|
init_pushconst_fastdiv(pc);
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
|
|
|
if (pipeline == nullptr) {
|
|
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
|
|
if (src1 != nullptr) {
|
|
std::cerr << " and " << ggml_type_name(src1->type);
|
|
}
|
|
std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, true);
|
|
vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, true) : vk_subbuffer{};
|
|
vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, true) : vk_subbuffer{};
|
|
vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, true) : vk_subbuffer{};
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true);
|
|
|
|
// Compute misalignment offset for descriptors and store it in in push constants.
|
|
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst);
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
|
|
switch (op) {
|
|
case GGML_OP_NORM:
|
|
case GGML_OP_RMS_NORM_BACK:
|
|
case GGML_OP_L2_NORM:
|
|
case GGML_OP_SOFT_MAX:
|
|
case GGML_OP_SOFT_MAX_BACK:
|
|
case GGML_OP_SUM_ROWS:
|
|
case GGML_OP_CUMSUM:
|
|
case GGML_OP_MEAN:
|
|
case GGML_OP_ARGMAX:
|
|
{
|
|
const uint32_t nr = ggml_nrows(src0);
|
|
if (nr > 262144) {
|
|
elements = { 512, 512, CEIL_DIV(nr, 262144) };
|
|
} else if (nr > 512) {
|
|
elements = { 512, CEIL_DIV(nr, 512), 1 };
|
|
} else {
|
|
elements = { nr, 1, 1 };
|
|
}
|
|
} break;
|
|
case GGML_OP_SOLVE_TRI:
|
|
{
|
|
uint32_t nr = (uint32_t)(ne02 * ne03);
|
|
if (nr > 262144) {
|
|
elements = { 512, 512, CEIL_DIV(nr, 262144) };
|
|
} else if (nr > 512) {
|
|
elements = { 512, CEIL_DIV(nr, 512), 1 };
|
|
} else {
|
|
elements = { nr, 1, 1 };
|
|
}
|
|
}
|
|
break;
|
|
case GGML_OP_RMS_NORM:
|
|
if (ctx->do_add_rms_partials) {
|
|
// Run one element per thread, 128 threads per workgroup
|
|
elements = { (uint32_t)CEIL_DIV(ne00, 128), 1, 1 };
|
|
} else {
|
|
elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 };
|
|
}
|
|
break;
|
|
|
|
case GGML_OP_SUM:
|
|
// We use GGML_OP_SUM_ROWS with 1 row.
|
|
elements = { 1, 1, 1 };
|
|
break;
|
|
case GGML_OP_GROUP_NORM:
|
|
{
|
|
const uint32_t num_groups = dst->op_params[0];
|
|
elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 };
|
|
} break;
|
|
case GGML_OP_DIAG_MASK_INF:
|
|
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
|
|
break;
|
|
case GGML_OP_ROPE:
|
|
case GGML_OP_ROPE_BACK:
|
|
{
|
|
uint32_t nrows = (uint32_t)ggml_nrows(src0);
|
|
uint32_t z = 1;
|
|
if (nrows > ctx->device->properties.limits.maxComputeWorkGroupCount[0]) {
|
|
z = CEIL_DIV(nrows, 32768);
|
|
nrows = 32768;
|
|
}
|
|
elements = { nrows, (uint32_t)ne00, z };
|
|
|
|
} break;
|
|
case GGML_OP_GET_ROWS:
|
|
elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
|
|
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
|
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
|
break;
|
|
case GGML_OP_ARGSORT:
|
|
GGML_ASSERT(0);
|
|
break;
|
|
case GGML_OP_IM2COL:
|
|
{
|
|
const bool is_2D = dst->op_params[6] == 1;
|
|
|
|
const uint32_t IC = src1->ne[is_2D ? 2 : 1];
|
|
|
|
const uint32_t KH = is_2D ? src0->ne[1] : 1;
|
|
const uint32_t KW = src0->ne[0];
|
|
|
|
const uint32_t OH = is_2D ? dst->ne[2] : 1;
|
|
const uint32_t OW = dst->ne[1];
|
|
|
|
const uint32_t batch = src1->ne[is_2D ? 3 : 2];
|
|
|
|
const uint32_t CHW = IC * KH * KW;
|
|
// Cap X workgroups to limit concurrent IC channel reads.
|
|
// The shader loops over X to cover the full CHW dimension.
|
|
// AMD prefers a lower limit
|
|
const uint32_t min_cap = ctx->device->vendor_id == VK_VENDOR_ID_AMD ? 512u : 4096u;
|
|
const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW));
|
|
elements = { x_elements, OW, OH * batch };
|
|
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
|
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
|
} break;
|
|
case GGML_OP_IM2COL_3D:
|
|
{
|
|
const uint32_t IC = ((const uint32_t *)(dst->op_params))[9];
|
|
|
|
const uint32_t N = ne13 / IC;
|
|
|
|
const uint32_t KD = ne02;
|
|
const uint32_t KH = ne01;
|
|
const uint32_t KW = ne00;
|
|
|
|
const uint32_t OD = dst->ne[3] / N;
|
|
const uint32_t OH = dst->ne[2];
|
|
const uint32_t OW = dst->ne[1];
|
|
|
|
const uint32_t IC_KD_KH_KW = IC*KD*KH*KW;
|
|
const uint32_t N_OD_OH = N*OD*OH;
|
|
|
|
elements = { IC_KD_KH_KW, OW, N_OD_OH };
|
|
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
|
} break;
|
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
{
|
|
const uint32_t dim = dst->op_params[0];
|
|
uint32_t half_ceil = (dim + 1) / 2;
|
|
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
|
} break;
|
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
{
|
|
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
|
} break;
|
|
case GGML_OP_COL2IM_1D:
|
|
{
|
|
elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 };
|
|
} break;
|
|
case GGML_OP_POOL_2D:
|
|
{
|
|
const uint32_t N = dst->ne[3];
|
|
const uint32_t OC = dst->ne[2];
|
|
const uint32_t OH = dst->ne[1];
|
|
const uint32_t OW = dst->ne[0];
|
|
elements = { N * OC * OH * OW, 1, 1};
|
|
} break;
|
|
case GGML_OP_CONV_2D:
|
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
|
if constexpr (std::is_same_v<PC, vk_op_conv2d_push_constants>) {
|
|
const uint32_t NPQ = pc.N * pc.OH * pc.OW;
|
|
const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.Cout, NPQ);
|
|
const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ);
|
|
|
|
elements = { pc.Cout, NPQ_blocks, 1 };
|
|
if (elements[1] > 512) {
|
|
elements[2] = CEIL_DIV(elements[1], 512);
|
|
elements[1] = 512;
|
|
}
|
|
} else {
|
|
GGML_ABORT("invalid push constant type for CONV_2D");
|
|
}
|
|
break;
|
|
case GGML_OP_ADD:
|
|
case GGML_OP_SUB:
|
|
case GGML_OP_DIV:
|
|
case GGML_OP_MUL:
|
|
case GGML_OP_ADD1:
|
|
case GGML_OP_ARANGE:
|
|
case GGML_OP_FILL:
|
|
case GGML_OP_SCALE:
|
|
case GGML_OP_SQR:
|
|
case GGML_OP_SQRT:
|
|
case GGML_OP_SIN:
|
|
case GGML_OP_COS:
|
|
case GGML_OP_LOG:
|
|
case GGML_OP_TRI:
|
|
case GGML_OP_DIAG:
|
|
case GGML_OP_CLAMP:
|
|
case GGML_OP_PAD:
|
|
case GGML_OP_ROLL:
|
|
case GGML_OP_REPEAT:
|
|
case GGML_OP_REPEAT_BACK:
|
|
case GGML_OP_CPY:
|
|
case GGML_OP_CONCAT:
|
|
case GGML_OP_UPSCALE:
|
|
case GGML_OP_UNARY:
|
|
case GGML_OP_GLU:
|
|
case GGML_OP_CONV_2D_DW:
|
|
{
|
|
uint32_t ne = ggml_nelements(dst);
|
|
if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
|
// Convert from number of logical elements to 2- or 4-byte units.
|
|
ne /= ggml_blck_size(src0->type);
|
|
if ((ggml_type_size(src0->type) % 4) == 0) {
|
|
ne *= ggml_type_size(src0->type) / 4;
|
|
} else {
|
|
ne *= ggml_type_size(src0->type) / 2;
|
|
}
|
|
}
|
|
// copy_to_quant has block size of 32, and each thread does QUANT_K elements.
|
|
// Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements.
|
|
// So divide by block size here before splitting into 512x512 groups.
|
|
if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
|
ne = CEIL_DIV(ne, ggml_blck_size(dst->type));
|
|
}
|
|
if (ne > 262144) {
|
|
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
|
} else if (ne > 512) {
|
|
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
|
} else {
|
|
elements = { ne, 1, 1 };
|
|
}
|
|
|
|
if (pipeline == ctx->device->pipeline_cpy_transpose_32 ||
|
|
pipeline == ctx->device->pipeline_cpy_transpose_16) {
|
|
// 32x32 tiles
|
|
elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32);
|
|
elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32);
|
|
elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]);
|
|
elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]);
|
|
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
|
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
|
}
|
|
} break;
|
|
case GGML_OP_ADD_ID:
|
|
{
|
|
elements = { (uint32_t)ne01, (uint32_t)ne02, 1 };
|
|
} break;
|
|
case GGML_OP_SET_ROWS:
|
|
{
|
|
uint32_t ne = ggml_nelements(src0);
|
|
if (ggml_is_quantized(dst->type)) {
|
|
// quants run 32 threads each doing QUANT_K elements
|
|
ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type));
|
|
} else {
|
|
// scalar types do one element per thread, running 512 threads
|
|
ne = CEIL_DIV(ne, 512);
|
|
}
|
|
if (ne > 262144) {
|
|
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
|
} else if (ne > 512) {
|
|
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
|
} else {
|
|
elements = { ne, 1, 1 };
|
|
}
|
|
}
|
|
break;
|
|
case GGML_OP_SSM_CONV:
|
|
{
|
|
const uint32_t nr = src0->ne[1];
|
|
const uint32_t n_t = dst->ne[1];
|
|
const uint32_t n_s = dst->ne[2];
|
|
elements = { nr, n_t, n_s };
|
|
}
|
|
break;
|
|
default:
|
|
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
|
|
break;
|
|
}
|
|
|
|
if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) {
|
|
vk_subbuffer a_buf = src0_buf;
|
|
if (ctx->do_add_rms_partials) {
|
|
a_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset);
|
|
}
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{ src0_buf, src1_buf, dst_buf, a_buf }, pc, elements);
|
|
} else if (op == GGML_OP_GLU) {
|
|
// Empty src1 is possible in glu, but the shader needs a buffer
|
|
vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf;
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc, elements);
|
|
} else if (op == GGML_OP_SOFT_MAX) {
|
|
// Empty src1 and src2 is possible in soft_max, but the shader needs a buffer
|
|
vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf;
|
|
vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf;
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, subbuf2, dst_buf }, pc, elements);
|
|
} else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
|
|
// Empty src2 and src3 is possible in rope, but the shader needs a buffer
|
|
vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf;
|
|
vk_subbuffer subbuf3 = use_src3 ? src3_buf : src0_buf;
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, subbuf2, dst_buf, subbuf3 }, pc, elements);
|
|
} else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) {
|
|
if (ctx->device->shader_int64 && ctx->device->buffer_device_address) {
|
|
// buffer device address path doesn't use dst buffer
|
|
dst_buf.size = 1;
|
|
}
|
|
// im2col uses only src1 and dst buffers
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src1_buf, dst_buf }, pc, elements);
|
|
} else if (op == GGML_OP_COUNT_EQUAL) {
|
|
// count_equal assumes that destination buffer is initialized with zeroes
|
|
ggml_vk_buffer_memset_async(subctx, dst_buf.buffer, dst_buf.offset, 0, dst_buf.size);
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements);
|
|
} else if (op == GGML_OP_OPT_STEP_SGD) {
|
|
// OPT_STEP_SGD works on src0, it does not need dst
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf }, pc, elements);
|
|
} else if (use_src3) {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, src3_buf, dst_buf }, pc, elements);
|
|
} else if (use_src2) {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, dst_buf }, pc, elements);
|
|
} else if (use_src1) {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements);
|
|
} else {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, dst_buf }, pc, elements);
|
|
}
|
|
}
|
|
|
|
void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
int nb1 = dst->op_params[0] / src0_type_size; // 4 bytes of float32
|
|
int nb2 = dst->op_params[1] / src0_type_size; // 4 bytes of float32
|
|
int nb3 = dst->op_params[2] / src0_type_size; // 4 bytes of float32
|
|
int offset = dst->op_params[3] / src0_type_size; // offset in bytes
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3,
|
|
0,
|
|
0.0f, 0.0f, offset,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
|
|
const ggml_tensor *first_node = cgraph->nodes[node_idx];
|
|
const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
|
|
|
|
// Make a list of all the tensors used by the op.
|
|
// Last element of the list is the dest tensor.
|
|
const ggml_tensor *tensors[MAX_PARAMETER_COUNT];
|
|
uint32_t num_srcs = ctx->num_additional_fused_ops + 2;
|
|
uint32_t num_tensors = num_srcs + 1;
|
|
GGML_ASSERT(num_tensors + ctx->do_add_rms_partials <= MAX_PARAMETER_COUNT);
|
|
|
|
tensors[0] = first_node->src[0];
|
|
tensors[1] = first_node->src[1];
|
|
for (int32_t i = 0; i < ctx->num_additional_fused_ops; ++i) {
|
|
// check whether the previous result is src[0] or src[1]
|
|
if (cgraph->nodes[node_idx + i] == cgraph->nodes[node_idx + i + 1]->src[0]) {
|
|
tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[1];
|
|
} else {
|
|
tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[0];
|
|
}
|
|
}
|
|
tensors[num_srcs] = dst;
|
|
|
|
vk_op_multi_add_push_constants pc;
|
|
pc.ne20 = (uint32_t)dst->ne[0];
|
|
pc.ne21 = (uint32_t)dst->ne[1];
|
|
pc.ne22 = (uint32_t)dst->ne[2];
|
|
pc.ne23 = (uint32_t)dst->ne[3];
|
|
|
|
for (uint32_t i = 0; i < num_tensors; ++i) {
|
|
const ggml_tensor *t = tensors[i];
|
|
pc.nb[i][0] = (uint32_t)t->nb[0] / sizeof(float);
|
|
pc.nb[i][1] = (uint32_t)t->nb[1] / sizeof(float);
|
|
pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float);
|
|
pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float);
|
|
}
|
|
pc.rms_partials = ctx->do_add_rms_partials;
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, tensors[0], tensors[1], nullptr, dst, dst->op);
|
|
|
|
if (pipeline == nullptr) {
|
|
std::cerr << "ggml_vulkan: Error: Missing multi_add";
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT];
|
|
vk_buffer buf[MAX_PARAMETER_COUNT];
|
|
size_t offset[MAX_PARAMETER_COUNT];
|
|
bool uma[MAX_PARAMETER_COUNT];
|
|
|
|
for (uint32_t i = 0; i < num_tensors; ++i) {
|
|
buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context;
|
|
buf[i] = nullptr;
|
|
offset[i] = 0;
|
|
uma[i] = false;
|
|
|
|
if (ctx->device->uma) {
|
|
ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]);
|
|
uma[i] = buf[i] != nullptr;
|
|
}
|
|
if (!uma[i]) {
|
|
buf[i] = buf_ctx[i]->dev_buffer;
|
|
offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs;
|
|
}
|
|
GGML_ASSERT(buf[i] != nullptr);
|
|
}
|
|
// If any remaining descriptors are unused, just point them at src[0]
|
|
for (uint32_t i = num_tensors; i < MAX_PARAMETER_COUNT; ++i) {
|
|
buf[i] = buf[0];
|
|
offset[i] = 0;
|
|
}
|
|
if (ctx->do_add_rms_partials) {
|
|
buf[num_tensors] = ctx->prealloc_add_rms_partials;
|
|
offset[num_tensors] = ctx->prealloc_size_add_rms_partials_offset;
|
|
}
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
|
|
uint32_t ne = ggml_nelements(dst);
|
|
if (ne > 262144) {
|
|
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
|
} else if (ne > 512) {
|
|
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
|
} else {
|
|
elements = { ne, 1, 1 };
|
|
}
|
|
|
|
static_assert(MAX_PARAMETER_COUNT == 12);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{
|
|
ggml_vk_subbuffer(ctx, buf[0], offset[0]),
|
|
ggml_vk_subbuffer(ctx, buf[1], offset[1]),
|
|
ggml_vk_subbuffer(ctx, buf[2], offset[2]),
|
|
ggml_vk_subbuffer(ctx, buf[3], offset[3]),
|
|
ggml_vk_subbuffer(ctx, buf[4], offset[4]),
|
|
ggml_vk_subbuffer(ctx, buf[5], offset[5]),
|
|
ggml_vk_subbuffer(ctx, buf[6], offset[6]),
|
|
ggml_vk_subbuffer(ctx, buf[7], offset[7]),
|
|
ggml_vk_subbuffer(ctx, buf[8], offset[8]),
|
|
ggml_vk_subbuffer(ctx, buf[9], offset[9]),
|
|
ggml_vk_subbuffer(ctx, buf[10], offset[10]),
|
|
ggml_vk_subbuffer(ctx, buf[11], offset[11]),
|
|
}, pc, elements);
|
|
}
|
|
|
|
void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, ctx->do_add_rms_partials,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SUB, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_MUL, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_DIV, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t src2_type_size = ggml_type_size(src2->type);
|
|
|
|
ggml_vk_op_f32<vk_op_add_id_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_ADD_ID, {
|
|
(uint32_t)dst->ne[0],
|
|
(uint32_t)dst->ne[1],
|
|
(uint32_t)src0->nb[1] / src0_type_size,
|
|
(uint32_t)src0->nb[2] / src0_type_size,
|
|
(uint32_t)src1->nb[1] / src1_type_size,
|
|
(uint32_t)src2->nb[1] / src2_type_size,
|
|
});
|
|
}
|
|
|
|
static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) {
|
|
GGML_ASSERT(version == 6 || version == 7);
|
|
int num_srcs = version == 6 ? 6 : 7;
|
|
|
|
for (int i = 0; i < num_srcs; i++) {
|
|
GGML_ASSERT(!ggml_is_quantized(dst->src[i]->type));
|
|
}
|
|
|
|
GGML_ASSERT(dst->buffer != nullptr);
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op);
|
|
GGML_ASSERT(pipeline != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
vk_subbuffer src_buf[7] = {};
|
|
for (int i = 0; i < num_srcs; i++) {
|
|
src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]);
|
|
}
|
|
|
|
std::array<uint32_t, 3> elements = {
|
|
(uint32_t)(pc.B * pc.H),
|
|
1,
|
|
1
|
|
};
|
|
|
|
if (version == 6) {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf},
|
|
pc, elements);
|
|
} else if (version == 7) {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf},
|
|
pc, elements);
|
|
} else {
|
|
// shouldn't happen
|
|
GGML_ASSERT(false);
|
|
}
|
|
}
|
|
|
|
void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
const size_t seq_length = dst->src[0]->ne[2];
|
|
const size_t n_embed = dst->ne[0];
|
|
const size_t n_heads = dst->src[0]->ne[1];
|
|
const size_t n_seqs = dst->src[5]->ne[1];
|
|
|
|
ggml_vk_op_f32_wkv(
|
|
ctx, subctx, dst,
|
|
{
|
|
(uint32_t)n_seqs,
|
|
(uint32_t)seq_length,
|
|
(uint32_t)n_embed,
|
|
(uint32_t)n_heads,
|
|
},
|
|
6
|
|
);
|
|
}
|
|
|
|
void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
const size_t seq_length = dst->src[0]->ne[2];
|
|
const size_t n_embed = dst->ne[0];
|
|
const size_t n_heads = dst->src[0]->ne[1];
|
|
const size_t n_seqs = dst->src[6]->ne[1];
|
|
|
|
ggml_vk_op_f32_wkv(
|
|
ctx, subctx, dst,
|
|
{
|
|
(uint32_t)n_seqs,
|
|
(uint32_t)seq_length,
|
|
(uint32_t)n_embed,
|
|
(uint32_t)n_heads,
|
|
},
|
|
7
|
|
);
|
|
}
|
|
|
|
void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
const ggml_tensor * src_q = dst->src[0];
|
|
const ggml_tensor * src_v = dst->src[2];
|
|
const ggml_tensor * src_beta = dst->src[4];
|
|
|
|
GGML_ASSERT(dst->buffer != nullptr);
|
|
|
|
const uint32_t S_v = (uint32_t)src_v->ne[0];
|
|
const uint32_t H = (uint32_t)src_v->ne[1];
|
|
const uint32_t n_tokens = (uint32_t)src_v->ne[2];
|
|
const uint32_t n_seqs = (uint32_t)src_v->ne[3];
|
|
|
|
// K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
|
|
const uint32_t K = (uint32_t)ggml_get_op_params_i32(dst, 0);
|
|
|
|
const uint32_t s_off = S_v * H * n_tokens * n_seqs;
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op);
|
|
GGML_ASSERT(pipeline != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
vk_subbuffer src_buf[6] = {};
|
|
for (int i = 0; i < 6; i++) {
|
|
src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]);
|
|
}
|
|
|
|
const uint32_t sq1 = (uint32_t)(src_q->nb[1] / sizeof(float));
|
|
const uint32_t sq2 = (uint32_t)(src_q->nb[2] / sizeof(float));
|
|
const uint32_t sq3 = (uint32_t)(src_q->nb[3] / sizeof(float));
|
|
const uint32_t sv1 = (uint32_t)(src_v->nb[1] / sizeof(float));
|
|
const uint32_t sv2 = (uint32_t)(src_v->nb[2] / sizeof(float));
|
|
const uint32_t sv3 = (uint32_t)(src_v->nb[3] / sizeof(float));
|
|
const uint32_t sb1 = (uint32_t)(src_beta->nb[1] / sizeof(float));
|
|
const uint32_t sb2 = (uint32_t)(src_beta->nb[2] / sizeof(float));
|
|
const uint32_t sb3 = (uint32_t)(src_beta->nb[3] / sizeof(float));
|
|
|
|
const uint32_t neq1 = (uint32_t)src_q->ne[1];
|
|
const uint32_t rq3 = (uint32_t)(src_v->ne[3] / src_q->ne[3]);
|
|
|
|
const float scale = 1.0f / sqrtf((float)S_v);
|
|
const vk_op_gated_delta_net_push_constants pc = {
|
|
H, n_tokens, n_seqs, s_off,
|
|
sq1, sq2, sq3,
|
|
sv1, sv2, sv3,
|
|
sb1, sb2, sb3,
|
|
neq1, rq3,
|
|
scale,
|
|
K
|
|
};
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf},
|
|
pc, { H, n_seqs, S_v });
|
|
}
|
|
|
|
void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
const ggml_tensor * src0 = dst->src[0];
|
|
const ggml_tensor * src1 = dst->src[1];
|
|
const ggml_tensor * src2 = dst->src[2];
|
|
const ggml_tensor * src3 = dst->src[3];
|
|
const ggml_tensor * src4 = dst->src[4];
|
|
const ggml_tensor * src5 = dst->src[5];
|
|
|
|
GGML_ASSERT(dst->buffer != nullptr);
|
|
|
|
const uint32_t head_dim = src0->ne[1];
|
|
const uint32_t n_head = src1->ne[1];
|
|
const uint32_t n_group = src4->ne[1];
|
|
const uint32_t n_tok = src1->ne[2];
|
|
const uint32_t n_seq = src1->ne[3];
|
|
|
|
bool is_mamba2 = (src3->nb[1] == sizeof(float));
|
|
GGML_ASSERT(is_mamba2);
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op);
|
|
GGML_ASSERT(pipeline != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
const int64_t s_off = ggml_nelements(src1) * sizeof(float);
|
|
|
|
const vk_op_ssm_scan_push_constants pc = {
|
|
(uint32_t)src0->nb[2], (uint32_t)src0->nb[3],
|
|
(uint32_t)src1->nb[2], (uint32_t)src1->nb[3],
|
|
(uint32_t)src2->nb[1], (uint32_t)src2->nb[2],
|
|
(uint32_t)src3->nb[1],
|
|
(uint32_t)src4->nb[2], (uint32_t)src4->nb[3],
|
|
(uint32_t)src5->nb[2], (uint32_t)src5->nb[3],
|
|
(uint32_t)s_off,
|
|
n_head, head_dim, n_group, n_tok
|
|
};
|
|
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
vk_subbuffer src_buf[7] = {};
|
|
for (int i = 0; i < 7 && dst->src[i] != nullptr; i++) {
|
|
src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]);
|
|
}
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
|
|
const uint32_t d_state = src0->ne[0];
|
|
uint32_t num_subgroups = d_state / ctx->device->subgroup_size;
|
|
const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups);
|
|
const uint32_t num_workgroups_y = n_seq;
|
|
elements = { num_workgroups_x, num_workgroups_y, 1 };
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf},
|
|
pc, elements);
|
|
}
|
|
|
|
void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
|
|
ggml_tensor * conv = cgraph->nodes[node_idx];
|
|
const ggml_tensor * src0 = conv->src[0];
|
|
const ggml_tensor * src1 = conv->src[1];
|
|
|
|
// Pick the destination tensor (last node in the fused chain) and the optional bias.
|
|
// Fusion modes: 0 = ssm_conv, 1 = ssm_conv+silu, 2 = ssm_conv+add(bias)+silu.
|
|
ggml_tensor * dst = conv;
|
|
const ggml_tensor * bias = nullptr;
|
|
|
|
if (ctx->num_additional_fused_ops == 1) {
|
|
dst = cgraph->nodes[node_idx + 1]; // silu
|
|
} else if (ctx->num_additional_fused_ops == 2) {
|
|
ggml_tensor * add = cgraph->nodes[node_idx + 1];
|
|
bias = (add->src[0] == conv) ? add->src[1] : add->src[0];
|
|
dst = cgraph->nodes[node_idx + 2]; // silu
|
|
}
|
|
|
|
// The shader always declares 4 bindings; bind src0 as a dummy when bias isn't fused.
|
|
const ggml_tensor * src2 = bias ? bias : src0;
|
|
|
|
ggml_vk_op_f32<vk_op_ssm_conv_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SSM_CONV, {
|
|
(uint32_t)src0->nb[1], (uint32_t)src0->nb[2],
|
|
(uint32_t)src1->nb[1],
|
|
(uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2],
|
|
(uint32_t)src1->ne[0],
|
|
(uint32_t)src0->ne[0],
|
|
(uint32_t)src0->ne[1],
|
|
(uint32_t)dst->ne[1],
|
|
(uint32_t)dst->ne[2],
|
|
});
|
|
}
|
|
|
|
static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) {
|
|
const ggml_tensor * x = dst->src[0];
|
|
const ggml_tensor * g = dst->src[1];
|
|
const ggml_tensor * gm = dst->src[2];
|
|
const ggml_tensor * gv = dst->src[3];
|
|
const ggml_tensor * p = dst->src[4];
|
|
|
|
GGML_ASSERT(x->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(g->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(gm->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(gv->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(p->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(dst->buffer != nullptr);
|
|
GGML_ASSERT(ggml_is_contiguous(x));
|
|
GGML_ASSERT(ggml_is_contiguous(g));
|
|
GGML_ASSERT(ggml_is_contiguous(gm));
|
|
GGML_ASSERT(ggml_is_contiguous(gv));
|
|
GGML_ASSERT(ggml_is_contiguous(p));
|
|
GGML_ASSERT(ggml_are_same_shape(x, g));
|
|
GGML_ASSERT(ggml_are_same_shape(x, gm));
|
|
GGML_ASSERT(ggml_are_same_shape(x, gv));
|
|
GGML_ASSERT(ggml_nelements(p) == 7);
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW);
|
|
GGML_ASSERT(pipeline != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x);
|
|
vk_subbuffer g_buf = ggml_vk_tensor_subbuffer(ctx, g);
|
|
vk_subbuffer gm_buf = ggml_vk_tensor_subbuffer(ctx, gm);
|
|
vk_subbuffer gv_buf = ggml_vk_tensor_subbuffer(ctx, gv);
|
|
vk_subbuffer p_buf = ggml_vk_tensor_subbuffer(ctx, p);
|
|
|
|
std::array<uint32_t, 3> elements = { (uint32_t)ggml_nelements(x), 1, 1 };
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{x_buf, g_buf, gm_buf, gv_buf, p_buf},
|
|
pc, elements);
|
|
}
|
|
|
|
void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
const size_t n = ggml_nelements(dst->src[0]);
|
|
|
|
ggml_vk_op_f32_opt_step_adamw(
|
|
ctx, subctx, dst,
|
|
{ (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f }
|
|
);
|
|
}
|
|
|
|
void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
const size_t n = ggml_nelements(dst->src[0]);
|
|
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
int * op_params = (int *)dst->op_params;
|
|
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONCAT, {
|
|
(uint32_t)ggml_nelements(dst),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, op_params[0],
|
|
});
|
|
}
|
|
|
|
void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
|
|
|
|
GGML_TENSOR_UNARY_OP_LOCALS
|
|
|
|
float sf0 = (float)ne0 / ne00;
|
|
float sf1 = (float)ne1 / ne01;
|
|
float sf2 = (float)ne2 / ne02;
|
|
float sf3 = (float)ne3 / ne03;
|
|
float pixel_offset = 0.5f;
|
|
|
|
if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
|
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
|
|
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
|
|
pixel_offset = 0.0f;
|
|
}
|
|
|
|
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
|
(uint32_t)ggml_nelements(dst), 0, 0,
|
|
(uint32_t)ne00, (uint32_t)ne01,
|
|
(uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size,
|
|
(uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
|
|
sf0, sf1, sf2, sf3, pixel_offset
|
|
});
|
|
}
|
|
|
|
void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
|
p.param1 = ggml_get_op_params_f32(dst, 0);
|
|
p.param2 = ggml_get_op_params_f32(dst, 1);
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst));
|
|
}
|
|
|
|
void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst));
|
|
}
|
|
|
|
void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD1, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
VK_LOG_DEBUG("ggml_vk_arange(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")");
|
|
|
|
vk_op_push_constants pc = {
|
|
(uint32_t)ggml_nelements(dst),
|
|
1,
|
|
ggml_get_op_params_f32(dst, 0),
|
|
ggml_get_op_params_f32(dst, 2),
|
|
0.0f, 0.0f,
|
|
};
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_ARANGE);
|
|
GGML_ASSERT(pipeline != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false);
|
|
|
|
std::array<uint32_t, 3> elements = { (uint32_t)ggml_nelements(dst), 1, 1 };
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements);
|
|
}
|
|
|
|
void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
|
VK_LOG_DEBUG("ggml_vk_fill(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")");
|
|
|
|
vk_op_push_constants pc = {
|
|
(uint32_t)ggml_nelements(dst),
|
|
1,
|
|
ggml_get_op_params_f32(dst, 0),
|
|
0.0f,
|
|
0.0f, 0.0f,
|
|
};
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_FILL);
|
|
GGML_ASSERT(pipeline != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false);
|
|
|
|
std::array<uint32_t, 3> elements = { (uint32_t)ggml_nelements(dst), 1, 1 };
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements);
|
|
}
|
|
|
|
void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst));
|
|
}
|
|
|
|
void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst));
|
|
}
|
|
|
|
void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LOG, vk_op_unary_push_constants_init(src0, dst));
|
|
}
|
|
|
|
void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
|
p.param1 = ggml_get_op_params_f32(dst, 0);
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
|
p.param1 = ggml_get_op_params_f32(dst, 0);
|
|
p.param2 = ggml_get_op_params_f32(dst, 1);
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst);
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const int32_t s0 = ggml_get_op_params_i32(dst, 0);
|
|
const int32_t s1 = ggml_get_op_params_i32(dst, 1);
|
|
const int32_t s2 = ggml_get_op_params_i32(dst, 2);
|
|
const int32_t s3 = ggml_get_op_params_i32(dst, 3);
|
|
const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000);
|
|
const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000);
|
|
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
|
memcpy(&p.param1, &s01_packed, sizeof(float));
|
|
memcpy(&p.param2, &s23_packed, sizeof(float));
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
uint32_t ne = (uint32_t)ggml_nelements(src0);
|
|
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
|
// Convert from number of logical elements to 2- or 4-byte units.
|
|
ne /= ggml_blck_size(src0->type);
|
|
if ((ggml_type_size(src0->type) % 4) == 0) {
|
|
ne *= ggml_type_size(src0->type) / 4;
|
|
} else {
|
|
ne *= ggml_type_size(src0->type) / 2;
|
|
}
|
|
}
|
|
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
// Skip empty skip_rows operations. For most ops the empty check at the start
|
|
// of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst
|
|
// with empty srcs.
|
|
if (ggml_is_empty(src0) || ggml_is_empty(src1)) {
|
|
return;
|
|
}
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SET_ROWS, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
float * op_params = (float *)dst->op_params;
|
|
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const int * int_op_params = (const int *)dst->op_params;
|
|
const float * float_op_params = (const float *)dst->op_params;
|
|
|
|
const uint32_t num_groups = int_op_params[0];
|
|
const float eps = float_op_params[1];
|
|
const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
|
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) {
|
|
const uint32_t ne = (uint32_t)node->ne[0];
|
|
const uint32_t denom = ctx->device->pipeline_add_rms[0][0][0]->wg_denoms[0];
|
|
const uint32_t num_partials = CEIL_DIV(ne, denom);
|
|
return num_partials;
|
|
}
|
|
|
|
uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node) {
|
|
const uint32_t num_partials = ggml_vk_rms_num_partials(ctx, node);
|
|
const uint32_t num_bytes = ROUNDUP_POW2(num_partials * sizeof(uint32_t), ctx->device->partials_binding_alignment);
|
|
return num_bytes;
|
|
}
|
|
|
|
static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *dst, const ggml_tensor *src0, const bool has_ff, bool backprop, const uint32_t set_rows_stride) {
|
|
const int n_dims = ((const int32_t *) dst->op_params)[1];
|
|
const int mode = ((const int32_t *) dst->op_params)[2];
|
|
// const int n_ctx = ((const int32_t *) dst->op_params)[3];
|
|
const int n_ctx_orig = ((const int32_t *) dst->op_params)[4];
|
|
const float freq_base = ((const float *) dst->op_params)[5];
|
|
const float freq_scale = ((const float *) dst->op_params)[6];
|
|
const float ext_factor = ((const float *) dst->op_params)[7];
|
|
const float attn_factor = ((const float *) dst->op_params)[8];
|
|
const float beta_fast = ((const float *) dst->op_params)[9];
|
|
const float beta_slow = ((const float *) dst->op_params)[10];
|
|
int sections[4] {};
|
|
if (mode & GGML_ROPE_TYPE_MROPE) {
|
|
memcpy(sections, (const int32_t *) dst->op_params + 11, sizeof(int)*4);
|
|
}
|
|
|
|
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
|
|
|
|
float corr_dims[2];
|
|
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
|
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
|
|
uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type);
|
|
uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type);
|
|
uint32_t nb03 = src0->nb[3] / ggml_type_size(src0->type);
|
|
|
|
uint32_t nb11 = dst->nb[1] / ggml_type_size(dst->type);
|
|
uint32_t nb12 = dst->nb[2] / ggml_type_size(dst->type);
|
|
uint32_t nb13 = dst->nb[3] / ggml_type_size(dst->type);
|
|
|
|
vk_op_rope_push_constants rope {
|
|
(uint32_t)mode, (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale,
|
|
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, has_ff,
|
|
{ sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride,
|
|
|
|
(uint32_t)src0->ne[0],
|
|
(uint32_t)src0->ne[1],
|
|
(uint32_t)src0->ne[2],
|
|
nb01, nb02, nb03,
|
|
nb11, nb12, nb13,
|
|
0, 0, // a_offset, d_offset filled in by init_pushconst_tensor_offsets
|
|
};
|
|
|
|
return rope;
|
|
}
|
|
|
|
void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params) {
|
|
ggml_tensor * dst;
|
|
const ggml_tensor * src0;
|
|
const ggml_tensor * src1;
|
|
|
|
if (ctx->num_additional_fused_ops > 0) {
|
|
// fused rms_norm + mul
|
|
ggml_tensor *mul = cgraph->nodes[node_idx + 1];
|
|
ggml_tensor *other_src = mul->src[0] == cgraph->nodes[node_idx + 0] ? mul->src[1] : mul->src[0];
|
|
dst = mul;
|
|
src0 = cgraph->nodes[node_idx]->src[0];
|
|
src1 = other_src;
|
|
} else {
|
|
dst = cgraph->nodes[node_idx];
|
|
src0 = src1 = dst->src[0];
|
|
}
|
|
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
uint32_t param3 = ctx->do_add_rms_partials ? ggml_vk_rms_num_partials(ctx, dst) : 0;
|
|
|
|
vk_op_binary_push_constants bin {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
op_params[0], 0.0f, (int32_t)param3,
|
|
};
|
|
|
|
// more than one fused op means rms_norm+mul+rope
|
|
if (ctx->num_additional_fused_ops > 1) {
|
|
static constexpr uint32_t max_tensors = 7;
|
|
const ggml_tensor *tensors[max_tensors] {};
|
|
|
|
ggml_tensor *rms = cgraph->nodes[node_idx + 0];
|
|
ggml_tensor *mul = cgraph->nodes[node_idx + 1];
|
|
ggml_tensor *rope = cgraph->nodes[node_idx + 2];
|
|
|
|
ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0];
|
|
|
|
bool do_set_rows = ctx->num_additional_fused_ops == 4;
|
|
|
|
tensors[0] = rms->src[0];
|
|
tensors[1] = other_src;
|
|
tensors[2] = mul;
|
|
tensors[3] = rope->src[1]; // pos
|
|
tensors[4] = rope->src[2]; // ff
|
|
tensors[5] = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; // dst
|
|
tensors[6] = do_set_rows ? tensors[5]->src[1] : nullptr;
|
|
const uint32_t set_rows_stride = do_set_rows ? tensors[5]->nb[1] / ggml_type_size(tensors[5]->type) : 0;
|
|
|
|
vk_op_rms_norm_mul_rope_push_constants pc;
|
|
pc.bin = bin;
|
|
pc.rope = ggml_vk_make_rope_constants(rope, rope->src[0], tensors[4] != nullptr, false, set_rows_stride);
|
|
|
|
vk_pipeline pipeline = tensors[5]->type == GGML_TYPE_F16 ? ctx->device->pipeline_rms_norm_mul_rope_f32_f16 : ctx->device->pipeline_rms_norm_mul_rope_f32_f32;
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
ggml_backend_vk_buffer_context * buf_ctx[max_tensors];
|
|
vk_buffer buf[max_tensors];
|
|
size_t offset[max_tensors];
|
|
bool uma[max_tensors];
|
|
|
|
for (uint32_t i = 0; i < max_tensors; ++i) {
|
|
if (!tensors[i]) {
|
|
// If any remaining descriptors are unused, just point them at src[0]
|
|
buf[i] = buf[0];
|
|
offset[i] = 0;
|
|
continue;
|
|
}
|
|
buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context;
|
|
buf[i] = nullptr;
|
|
offset[i] = 0;
|
|
uma[i] = false;
|
|
|
|
if (ctx->device->uma) {
|
|
ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]);
|
|
uma[i] = buf[i] != nullptr;
|
|
}
|
|
if (!uma[i]) {
|
|
buf[i] = buf_ctx[i]->dev_buffer;
|
|
offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs;
|
|
}
|
|
GGML_ASSERT(buf[i] != nullptr);
|
|
}
|
|
|
|
// a_offset is unused (the fused path reads from shared memory), but the rope/set_rows dst can be misaligned.
|
|
// Round the binding offset down to the storage buffer alignment; the in-element shift goes in pc.rope.d_offset.
|
|
pc.rope.d_offset = get_misalign_bytes(ctx, tensors[5]) / ggml_type_size(tensors[5]->type);
|
|
offset[5] &= ~(size_t(ctx->device->properties.limits.minStorageBufferOffsetAlignment) - 1);
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] };
|
|
|
|
static_assert(max_tensors == 7);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
{
|
|
ggml_vk_subbuffer(ctx, buf[0], offset[0]),
|
|
ggml_vk_subbuffer(ctx, buf[1], offset[1]),
|
|
ggml_vk_subbuffer(ctx, buf[2], offset[2]),
|
|
ggml_vk_subbuffer(ctx, buf[3], offset[3]),
|
|
ggml_vk_subbuffer(ctx, buf[4], offset[4]),
|
|
ggml_vk_subbuffer(ctx, buf[5], offset[5]),
|
|
ggml_vk_subbuffer(ctx, buf[6], offset[6]),
|
|
}, pc, elements);
|
|
} else {
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, std::move(bin));
|
|
}
|
|
|
|
if (ctx->do_add_rms_partials_offset_calculation) {
|
|
ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0);
|
|
ctx->do_add_rms_partials = false;
|
|
ctx->do_add_rms_partials_offset_calculation = false;
|
|
}
|
|
}
|
|
|
|
void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
float * op_params = (float *)dst->op_params;
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const float * op_params = (const float *)dst->op_params;
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
|
p.param1 = op_params[0];
|
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst));
|
|
}
|
|
|
|
void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
float * op_params = (float *)dst->op_params;
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
|
p.param1 = op_params[1];
|
|
p.param2 = op_params[2];
|
|
p.param3 = op_params[3];
|
|
p.param4 = op_params[4];
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const float * op_params_f = (const float *)dst->op_params;
|
|
|
|
const bool swapped = (bool)dst->op_params[1];
|
|
const bool split = src1 != nullptr;
|
|
const float alpha = op_params_f[2];
|
|
const float limit = op_params_f[3];
|
|
|
|
if (!split) {
|
|
GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]);
|
|
} else {
|
|
GGML_ASSERT(src0->ne[0] == src1->ne[0]);
|
|
GGML_ASSERT(src0->ne[0] == dst->ne[0]);
|
|
GGML_ASSERT(src0->type == src1->type);
|
|
}
|
|
|
|
const uint32_t mode = split ? 2 : (swapped ? 1 : 0);
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size;
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU,
|
|
{
|
|
(uint32_t)ggml_nelements(dst),
|
|
(uint32_t)src0->ne[0],
|
|
(uint32_t)dst->ne[0],
|
|
mode,
|
|
alpha,
|
|
limit,
|
|
(uint32_t)(src0->nb[0] / src0_type_size),
|
|
(uint32_t)(src0->nb[1] / src0_type_size),
|
|
(uint32_t)(src0->nb[2] / src0_type_size),
|
|
(uint32_t)(src0->nb[3] / src0_type_size),
|
|
(uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size),
|
|
(uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size),
|
|
(uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size),
|
|
(uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size),
|
|
(uint32_t)(dst->nb[0] / dst_type_size),
|
|
(uint32_t)(dst->nb[1] / dst_type_size),
|
|
(uint32_t)(dst->nb[2] / dst_type_size),
|
|
(uint32_t)(dst->nb[3] / dst_type_size),
|
|
(uint32_t)dst->ne[1],
|
|
(uint32_t)dst->ne[2],
|
|
0,
|
|
0, 0, 0, 0, 0, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
int32_t * op_params = (int32_t *)dst->op_params;
|
|
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
|
}
|
|
|
|
void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
float * op_params = (float *)dst->op_params;
|
|
|
|
float scale = op_params[0];
|
|
float max_bias = op_params[1];
|
|
|
|
const uint32_t ncols = (uint32_t)src0->ne[0];
|
|
const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
|
|
const uint32_t nrows_y = (uint32_t)src0->ne[1];
|
|
|
|
const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u;
|
|
const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u;
|
|
const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u;
|
|
const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u;
|
|
const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u;
|
|
|
|
const uint32_t n_head_kv = src0->ne[2];
|
|
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
|
|
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
|
|
|
vk_op_soft_max_push_constants pc {
|
|
ncols,
|
|
src1 != nullptr ? nrows_y : (uint32_t)0,
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
|
|
ne12, ne13,
|
|
nb11, nb12, nb13,
|
|
scale, max_bias,
|
|
m0, m1,
|
|
n_head_log2,
|
|
nrows_x,
|
|
src2 != nullptr
|
|
};
|
|
|
|
if (ncols <= 16384) {
|
|
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
|
|
} else {
|
|
|
|
vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
|
|
vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
|
|
vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
|
|
vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
|
|
uint32_t elems_per_wg = 128 * 4;
|
|
uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
|
|
size_t tmp_size = num_wgs * nrows_x * sizeof(float);
|
|
|
|
if (ctx->prealloc_size_x < tmp_size) {
|
|
ctx->prealloc_size_x = tmp_size;
|
|
ggml_vk_preallocate_buffers(ctx, subctx);
|
|
}
|
|
if (ctx->prealloc_size_y < tmp_size) {
|
|
ctx->prealloc_size_y = tmp_size;
|
|
ggml_vk_preallocate_buffers(ctx, subctx);
|
|
}
|
|
if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
|
|
vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
|
|
vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
|
|
|
|
std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
|
|
|
|
vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
|
|
vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
|
|
vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
|
|
|
ctx->prealloc_x_need_sync = true;
|
|
ctx->prealloc_y_need_sync = true;
|
|
}
|
|
}
|
|
|
|
void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
float * op_params = (float *)dst->op_params;
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1], 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
|
|
topk_moe_mode mode = ctx->fused_topk_moe_mode;
|
|
ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
|
|
ggml_tensor * bias = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits;
|
|
ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
|
|
ggml_tensor * ids = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] :
|
|
(mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] :
|
|
cgraph->nodes[node_idx + 3];
|
|
|
|
GGML_ASSERT(logits->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(bias->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(weights->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
|
|
|
const int n_experts = logits->ne[0];
|
|
const int n_rows = logits->ne[1];
|
|
const int n_expert_used = weights->ne[1];
|
|
|
|
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
|
|
|
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits);
|
|
vk_subbuffer bias_buf = ggml_vk_tensor_subbuffer(ctx, bias);
|
|
vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights);
|
|
vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids);
|
|
|
|
vk_op_topk_moe_push_constants pc {};
|
|
pc.n_rows = n_rows;
|
|
pc.n_experts_push = n_experts;
|
|
pc.n_expert_used = n_expert_used;
|
|
pc.clamp_min = -std::numeric_limits<float>::infinity();
|
|
pc.clamp_max = std::numeric_limits<float>::infinity();
|
|
if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
|
|
ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
|
|
GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
|
|
pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
|
|
pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
|
|
}
|
|
if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) {
|
|
ggml_tensor * clamp = cgraph->nodes[node_idx + 8];
|
|
GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
|
|
pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
|
|
pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
|
|
}
|
|
|
|
#define GATING_FUNC_SOFTMAX 0
|
|
#define GATING_FUNC_SIGMOID 1
|
|
#define GATING_FUNC_SOFTMAX_WEIGHT 2
|
|
|
|
pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID :
|
|
mode == TOPK_MOE_LATE_SOFTMAX ? GATING_FUNC_SOFTMAX_WEIGHT :
|
|
GATING_FUNC_SOFTMAX;
|
|
pc.has_bias = mode == TOPK_MOE_SIGMOID_NORM_BIAS;
|
|
pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS;
|
|
if (ctx->fused_topk_moe_scale) {
|
|
GGML_ASSERT(weights->op == GGML_OP_SCALE);
|
|
pc.output_scale = ggml_get_op_params_f32(weights, 0);
|
|
pc.output_bias = ggml_get_op_params_f32(weights, 1);
|
|
} else {
|
|
pc.output_scale = 1.0f;
|
|
pc.output_bias = 0.0f;
|
|
}
|
|
|
|
GGML_ASSERT(n_expert_used <= n_experts);
|
|
|
|
const uint32_t rows_per_block = 4;
|
|
std::array<uint32_t, 3> elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 };
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements);
|
|
}
|
|
|
|
void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {
|
|
ggml_tensor * dst = cgraph->nodes[node_idx];
|
|
const ggml_tensor * src0 = dst->src[0];
|
|
const ggml_tensor * src1 = dst->src[1];
|
|
const ggml_tensor * src2 = dst->src[2];
|
|
const ggml_tensor * src3 = nullptr;
|
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
const float freq_base = ((float *) dst->op_params)[5];
|
|
const float beta_fast = ((float *) dst->op_params)[9];
|
|
const float beta_slow = ((float *) dst->op_params)[10];
|
|
int sections[4] {};
|
|
if (mode & GGML_ROPE_TYPE_MROPE) {
|
|
memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
|
|
}
|
|
|
|
float corr_dims[2];
|
|
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
|
|
uint32_t set_rows_stride = 0;
|
|
// Fused rope + view + set_rows passes the set_rows destination stride in set_rows_stride
|
|
// and overrides the dst and sets src3=row_indices
|
|
if (ctx->num_additional_fused_ops > 0) {
|
|
set_rows_stride = cgraph->nodes[node_idx + 2]->nb[1] / ggml_type_size(cgraph->nodes[node_idx + 2]->type);
|
|
src3 = cgraph->nodes[node_idx + 2]->src[1];
|
|
dst = cgraph->nodes[node_idx + 2];
|
|
}
|
|
|
|
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, src3, dst, GGML_OP_ROPE,
|
|
ggml_vk_make_rope_constants(cgraph->nodes[node_idx], src0, src2 != nullptr, backprop, set_rows_stride));
|
|
}
|
|
|
|
void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const uint32_t * op_params = (const uint32_t *)dst->op_params;
|
|
|
|
uint32_t ncols = src0->ne[0];
|
|
uint32_t nrows = ggml_nrows(src0);
|
|
|
|
uint32_t ncols_pad_log2 = (uint32_t)ceilf(log2f(float(ncols)));
|
|
uint32_t ncolsp2 = 1 << ncols_pad_log2;
|
|
|
|
vk_op_argsort_push_constants pc { ncols, ncolsp2, ncols_pad_log2, nrows, op_params[0], 0, 0, 0, 0, };
|
|
|
|
// Pick the largest workgroup size <= ncolsp2
|
|
uint32_t pipeline_idx = std::min(ncols_pad_log2, num_argsort_pipelines - 1);
|
|
|
|
// Use the "small" argsort shader if the whole sort can be done by a single workgroup.
|
|
bool use_small = ncols_pad_log2 <= ctx->device->max_workgroup_size_log2 &&
|
|
ctx->device->pipeline_argsort_f32[pipeline_idx] != nullptr;
|
|
|
|
vk_pipeline pipeline = use_small ? ctx->device->pipeline_argsort_f32[pipeline_idx]
|
|
: ctx->device->pipeline_argsort_large_f32[pipeline_idx];
|
|
|
|
vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0);
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
vk_subbuffer subbuf1 = dst_buf;
|
|
|
|
// Reserve space for ivec2 per element, with rows padded to a power of two
|
|
if (!use_small) {
|
|
const size_t x_sz = size_t{ncolsp2} * nrows * 2 * sizeof(int);
|
|
|
|
if (ctx->prealloc_size_x < x_sz) {
|
|
ctx->prealloc_size_x = x_sz;
|
|
ggml_vk_preallocate_buffers(ctx, subctx);
|
|
}
|
|
if (ctx->prealloc_x_need_sync) {
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
subbuf1 = { ctx->prealloc_x, 0, ctx->prealloc_x->size };
|
|
}
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
|
|
elements[0] = ncolsp2;
|
|
elements[1] = std::min((uint32_t)ggml_nrows(src0), ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
|
elements[2] = 1;
|
|
|
|
// First dispatch initializes tmp_idx and does the first N passes where
|
|
// there is only communication between threads in the same workgroup.
|
|
{
|
|
vk_op_argsort_push_constants pc2 = pc;
|
|
pc2.outer_start = 0;
|
|
pc2.outer_end = std::min(ncols_pad_log2, ctx->device->max_workgroup_size_log2);
|
|
pc2.inner_start = 0;
|
|
pc2.inner_end = 100;
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements);
|
|
}
|
|
if (!use_small) {
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
// Loop over outer/inner passes, synchronizing between each pass.
|
|
for (uint32_t outer = ctx->device->max_workgroup_size_log2; outer < ncols_pad_log2; ++outer) {
|
|
for (uint32_t inner = 0; inner < outer + 1; ++inner) {
|
|
vk_op_argsort_push_constants pc2 = pc;
|
|
pc2.outer_start = outer;
|
|
pc2.outer_end = outer + 1;
|
|
pc2.inner_start = inner;
|
|
pc2.inner_end = inner + 1;
|
|
// When the inner idx is large enough, there's only communication
|
|
// within a workgroup. So the remaining inner iterations can all
|
|
// run in the same dispatch.
|
|
if (outer - inner < pipeline_idx) {
|
|
pc2.inner_end = 100;
|
|
inner = outer;
|
|
pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx];
|
|
} else {
|
|
// Smaller workgroup empirically seems to perform better
|
|
pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx - 2];
|
|
}
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements);
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
}
|
|
ctx->prealloc_x_need_sync = true;
|
|
}
|
|
}
|
|
|
|
void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
uint32_t ncols = src0->ne[0];
|
|
uint32_t nrows = ggml_nrows(src0);
|
|
uint32_t k = dst->ne[0];
|
|
|
|
vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 };
|
|
|
|
if (ctx->prealloc_x_need_sync) {
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
elements[1] = std::min(nrows, ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
|
elements[2] = 1;
|
|
|
|
uint32_t num_elements = ncols;
|
|
|
|
// Each iteration reduces a workgroup's worth of elements down to the K
|
|
// largest elements. Repeat until we have the top K elements.
|
|
// Need to do at least one iteration to write out the results.
|
|
bool done_one_iter = false;
|
|
uint32_t dbl_buf_index = 0;
|
|
size_t dbl_buf_size;
|
|
while (num_elements > k || !done_one_iter) {
|
|
|
|
// Prefer going as small as num_topk_pipelines - 3 for perf reasons.
|
|
// But if K is larger, then we need a larger workgroup
|
|
uint32_t max_pipeline = num_topk_pipelines - 1;
|
|
uint32_t preferred_pipeline = std::max(num_topk_pipelines - 3, (uint32_t)log2f(float(k)) + 2);
|
|
max_pipeline = std::min(preferred_pipeline, max_pipeline);
|
|
uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1;
|
|
// require full subgroup
|
|
min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2);
|
|
|
|
uint32_t pipeline_idx = (uint32_t)ceilf(log2f(float(num_elements)));
|
|
pipeline_idx = std::min(pipeline_idx, max_pipeline);
|
|
pipeline_idx = std::max(pipeline_idx, min_pipeline);
|
|
|
|
if (num_elements > (1u << pipeline_idx)) {
|
|
// If we could finish on this loop iteration (i.e. a single workgroup)
|
|
// then do so. It's better than the overhead of another pass.
|
|
for (uint32_t i = pipeline_idx; i < num_topk_pipelines; ++i) {
|
|
if (num_elements <= (1u << i)) {
|
|
pipeline_idx = i;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
vk_pipeline pipeline = ctx->device->pipeline_topk_f32[pipeline_idx];
|
|
// If the device doesn't support a pipeline this large, use smaller
|
|
while (!pipeline) {
|
|
pipeline_idx--;
|
|
GGML_ASSERT(pipeline_idx >= min_pipeline);
|
|
pipeline = ctx->device->pipeline_topk_f32[pipeline_idx];
|
|
}
|
|
|
|
vk_op_topk_push_constants pc2 = pc;
|
|
pc2.ncols_input = num_elements;
|
|
|
|
// Number of elements remaining after this pass
|
|
uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]);
|
|
|
|
pc2.ncols_output = num_dst_elements;
|
|
|
|
if (!done_one_iter) {
|
|
// Reserve space for ivec2 per element, double buffered
|
|
// K per workgroup per row
|
|
dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int);
|
|
dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
|
const size_t x_sz = dbl_buf_size * 2;
|
|
|
|
if (ctx->prealloc_size_x < x_sz) {
|
|
ctx->prealloc_size_x = x_sz;
|
|
ggml_vk_preallocate_buffers(ctx, subctx);
|
|
}
|
|
}
|
|
|
|
vk_subbuffer src_buf;
|
|
vk_subbuffer dst_buf;
|
|
|
|
if (num_elements == ncols) {
|
|
pc2.first_pass = 1;
|
|
src_buf = ggml_vk_tensor_subbuffer(ctx, src0);
|
|
} else {
|
|
src_buf = { ctx->prealloc_x, dbl_buf_index * dbl_buf_size, dbl_buf_size };
|
|
}
|
|
if (num_dst_elements == k) {
|
|
pc2.last_pass = 1;
|
|
dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
} else {
|
|
dst_buf = { ctx->prealloc_x, (dbl_buf_index ^ 1) * dbl_buf_size, dbl_buf_size };
|
|
}
|
|
|
|
elements[0] = num_elements;
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc2, elements);
|
|
num_elements = num_dst_elements;
|
|
dbl_buf_index ^= 1;
|
|
if (num_elements > k) {
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
done_one_iter = true;
|
|
}
|
|
ctx->prealloc_x_need_sync = true;
|
|
}
|
|
|
|
void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0));
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p);
|
|
}
|
|
|
|
void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p);
|
|
}
|
|
|
|
void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
|
|
p.weight = 1.0f / (float)src0->ne[0];
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p);
|
|
}
|
|
|
|
void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
|
|
// Use the single pass shader when the rows are small or there are enough rows to fill the GPU.
|
|
// For fewer, larger rows, use the multipass shader to spread each row across SMs.
|
|
if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) {
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc);
|
|
return;
|
|
}
|
|
|
|
// First pass computes partial sums within a block, and stores the last partial
|
|
// to the temp buffer. Second pass sums the block partials from the temp buffer
|
|
// and adds that to the result of the first pass.
|
|
vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32;
|
|
vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32;
|
|
GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr);
|
|
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
|
|
elements[0] = dst->ne[0];
|
|
elements[1] = (uint32_t)ggml_nrows(dst);
|
|
elements[2] = 1;
|
|
|
|
size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst);
|
|
|
|
if (ctx->prealloc_size_split_k < temp_size) {
|
|
ctx->prealloc_size_split_k = temp_size;
|
|
ggml_vk_preallocate_buffers(ctx, subctx);
|
|
}
|
|
|
|
vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0);
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
|
|
|
|
if (ctx->prealloc_split_k_need_sync) {
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
}
|
|
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements);
|
|
ggml_vk_sync_buffers(ctx, subctx);
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements);
|
|
|
|
ctx->prealloc_split_k_need_sync = true;
|
|
}
|
|
|
|
void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|
|
void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOLVE_TRI, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
0,
|
|
0.0f, 0.0f, 0,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
const int32_t s0 = dst->op_params[0];
|
|
const int32_t s1 = dst->op_params[1];
|
|
const int32_t p0 = dst->op_params[2];
|
|
const int32_t p1 = dst->op_params[3];
|
|
const int32_t d0 = dst->op_params[4];
|
|
const int32_t d1 = dst->op_params[5];
|
|
|
|
const bool is_2D = dst->op_params[6] == 1;
|
|
|
|
const uint32_t IC = src1->ne[is_2D ? 2 : 1];
|
|
const uint32_t IH = is_2D ? src1->ne[1] : 1;
|
|
const uint32_t IW = src1->ne[0];
|
|
|
|
const uint32_t KH = is_2D ? src0->ne[1] : 1;
|
|
const uint32_t KW = src0->ne[0];
|
|
|
|
const uint32_t OH = is_2D ? dst->ne[2] : 1;
|
|
const uint32_t OW = dst->ne[1];
|
|
|
|
const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
|
const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
|
|
|
|
const uint32_t batch = src1->ne[is_2D ? 3 : 2];
|
|
|
|
const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
|
const vk_buffer d_buf = d_buf_ctx->dev_buffer;
|
|
|
|
const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs;
|
|
|
|
ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL, {
|
|
dst_addr,
|
|
batch_offset, offset_delta,
|
|
IC, IW, IH, OW, OH, KW, KH,
|
|
OH * batch,
|
|
IC * KH * KW,
|
|
s0, s1, p0, p1, d0, d1, batch * IC
|
|
});
|
|
}
|
|
|
|
void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
|
|
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
|
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
|
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
|
|
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
|
|
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
|
|
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
|
|
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
|
|
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
|
|
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
|
|
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
|
|
|
|
const int64_t N = ne13 / IC;
|
|
const int64_t ID = ne12;
|
|
const int64_t IH = ne11;
|
|
const int64_t IW = ne10;
|
|
|
|
const int64_t KD = ne02;
|
|
const int64_t KH = ne01;
|
|
const int64_t KW = ne00;
|
|
|
|
const int64_t OD = ne3 / N;
|
|
const int64_t OH = ne2;
|
|
const int64_t OW = ne1;
|
|
|
|
const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
|
const vk_buffer d_buf = d_buf_ctx->dev_buffer;
|
|
|
|
const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs;
|
|
|
|
vk_op_im2col_3d_push_constants pc {};
|
|
|
|
pc.dst_addr = dst_addr;
|
|
pc.nb10 = nb10 / ggml_type_size(src1->type);
|
|
pc.nb11 = nb11 / ggml_type_size(src1->type);
|
|
pc.nb12 = nb12 / ggml_type_size(src1->type);
|
|
pc.nb13 = nb13 / ggml_type_size(src1->type);
|
|
pc.s0 = s0;
|
|
pc.s1 = s1;
|
|
pc.s2 = s2;
|
|
pc.p0 = p0;
|
|
pc.p1 = p1;
|
|
pc.p2 = p2;
|
|
pc.d0 = d0;
|
|
pc.d1 = d1;
|
|
pc.d2 = d2;
|
|
pc.IW = IW;
|
|
pc.IH = IH;
|
|
pc.ID = ID;
|
|
pc.IC = IC;
|
|
pc.KW = KW;
|
|
pc.OH = OH;
|
|
pc.KD_KH_KW = KD*KH*KW;
|
|
pc.KH_KW = KH*KW;
|
|
pc.IC_KD_KH_KW = IC*KD*KH*KW;
|
|
pc.N_OD_OH = N*OD*OH;
|
|
pc.OD_OH = OD*OH;
|
|
pc.OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW;
|
|
pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
|
|
pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
|
|
|
|
ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc));
|
|
}
|
|
|
|
void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const uint32_t dim = dst->op_params[0];
|
|
const uint32_t max_period = dst->op_params[1];
|
|
const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
|
|
|
|
ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
|
|
nb1, dim, max_period,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
// src0: (K, Cout, Cin, 1) -- kernel
|
|
// src1: (L, Cin, 1, 1) -- input
|
|
// dst: (*, Cout, 1, 1)
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
|
|
GGML_ASSERT(nb00 == sizeof(float));
|
|
GGML_ASSERT(nb10 == sizeof(float));
|
|
|
|
const int32_t s0 = dst->op_params[0];
|
|
|
|
vk_op_conv_transpose_1d_push_constants p{};
|
|
p.Cout = static_cast<uint32_t>(ne01);
|
|
p.Cin = static_cast<uint32_t>(ne02);
|
|
p.K = static_cast<uint32_t>(ne00);
|
|
p.L = static_cast<uint32_t>(ne10);
|
|
p.KL = static_cast<uint32_t>(ne0);
|
|
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
|
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
|
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
|
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
|
p.s0 = static_cast<uint32_t>(s0);
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
// src0: [K_OC, T_in] columns from matmul
|
|
// dst: [T_out, OC]
|
|
|
|
const int32_t stride = dst->op_params[0];
|
|
const int32_t oc = dst->op_params[1];
|
|
const int32_t p0 = dst->op_params[2];
|
|
|
|
const uint32_t K_OC = static_cast<uint32_t>(src0->ne[0]);
|
|
const uint32_t T_in = static_cast<uint32_t>(src0->ne[1]);
|
|
const uint32_t T_out = static_cast<uint32_t>(dst->ne[0]);
|
|
const uint32_t OC = static_cast<uint32_t>(oc);
|
|
const uint32_t K = K_OC / OC;
|
|
|
|
vk_op_col2im_1d_push_constants p{};
|
|
p.T_out = T_out;
|
|
p.OC = OC;
|
|
p.K_OC = K_OC;
|
|
p.T_in = T_in;
|
|
p.K = K;
|
|
p.stride = stride;
|
|
p.p0 = p0;
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
|
|
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
|
|
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
|
|
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
|
|
ggml_tensor * add = cgraph->nodes[node_idx + 4];
|
|
|
|
// x carries the full activation shape, a is the broadcast operand
|
|
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
|
|
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
|
|
|
|
// mul1 reads sqr and inv_b in either operand order
|
|
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
|
|
|
|
vk_pipeline pipeline = nullptr;
|
|
switch (x->type) {
|
|
case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break;
|
|
case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break;
|
|
case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break;
|
|
default: GGML_ABORT("unsupported type");
|
|
}
|
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
|
|
vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x);
|
|
vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a);
|
|
vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b);
|
|
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add);
|
|
|
|
vk_op_snake_push_constants pc{};
|
|
pc.ne0 = static_cast<uint32_t>(x->ne[0]);
|
|
pc.ne1 = static_cast<uint32_t>(x->ne[1]);
|
|
|
|
std::array<uint32_t, 3> elements = { pc.ne0, pc.ne1, 1 };
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements);
|
|
}
|
|
|
|
void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
|
const int32_t k1 = dst->op_params[1];
|
|
const int32_t k0 = dst->op_params[2];
|
|
const int32_t s1 = dst->op_params[3];
|
|
const int32_t s0 = dst->op_params[4];
|
|
const int32_t p1 = dst->op_params[5];
|
|
const int32_t p0 = dst->op_params[6];
|
|
|
|
const uint32_t IH = src0->ne[1];
|
|
const uint32_t IW = src0->ne[0];
|
|
|
|
const uint32_t N = dst->ne[3];
|
|
|
|
const uint32_t OC = dst->ne[2];
|
|
const uint32_t OH = dst->ne[1];
|
|
const uint32_t OW = dst->ne[0];
|
|
|
|
const uint32_t parallel_elements = N * OC * OH * OW;
|
|
|
|
ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
|
|
IW, IH, OW, OH, OC,
|
|
parallel_elements,
|
|
op,
|
|
k0, k1, s0, s1, p0, p1,
|
|
});
|
|
}
|
|
|
|
void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
|
|
const ggml_tensor * src1, ggml_tensor * dst) {
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
|
|
GGML_ASSERT(nb10 == sizeof(float));
|
|
GGML_ASSERT(nb0 == sizeof(float));
|
|
|
|
bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D;
|
|
|
|
vk_op_conv2d_push_constants p{};
|
|
p.Cout = static_cast<uint32_t>(!transpose ? ne03 : ne02);
|
|
p.Cin = static_cast<uint32_t>(!transpose ? ne02 : ne03);
|
|
p.N = static_cast<uint32_t>(ne13);
|
|
GGML_ASSERT(p.Cout == ne2);
|
|
GGML_ASSERT(p.Cin == ne12);
|
|
|
|
p.W = static_cast<uint32_t>(ne10);
|
|
p.H = static_cast<uint32_t>(ne11);
|
|
p.OW = static_cast<uint32_t>(ne0);
|
|
p.OH = static_cast<uint32_t>(ne1);
|
|
|
|
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
|
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
|
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
|
|
|
|
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
|
p.nb12 = static_cast<uint32_t>(nb12 / nb10);
|
|
p.nb13 = static_cast<uint32_t>(nb13 / nb10);
|
|
|
|
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
|
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
|
|
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
vk_op_conv2d_dw_push_constants p{};
|
|
p.ne = ggml_nelements(dst);
|
|
p.channels = dst->ne[2];
|
|
p.batches = dst->ne[3];
|
|
p.dst_w = dst->ne[0];
|
|
p.dst_h = dst->ne[1];
|
|
p.src_w = src1->ne[0];
|
|
p.src_h = src1->ne[1];
|
|
p.knl_w = src0->ne[0];
|
|
p.knl_h = src0->ne[1];
|
|
p.stride_x = dst->op_params[0];
|
|
p.stride_y = dst->op_params[1];
|
|
p.pad_x = dst->op_params[2];
|
|
p.pad_y = dst->op_params[3];
|
|
p.dilation_x = dst->op_params[4];
|
|
p.dilation_y = dst->op_params[5];
|
|
|
|
GGML_ASSERT(src0->ne[3] == p.channels);
|
|
GGML_ASSERT(src1->ne[3] == p.batches);
|
|
|
|
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p));
|
|
}
|
|
|
|
void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
const float * op_params = (const float *)dst->op_params;
|
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f, 0.0f, 0.0f });
|
|
}
|
|
|