mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
[SYCL] support OPs: conv_2d, conv_2d_dw, conv2d_transpose (#24600)
* fix conflict * fix format issue, rename * rm debug code * correct the file name
This commit is contained in:
parent
0b73fc79fe
commit
6f1034b32a
@ -161,6 +161,64 @@ You could update your test result in it directly.
|
||||
|
||||
Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.
|
||||
|
||||
## Quick Development WOW
|
||||
|
||||
This chapter is for quick development & try with SYCL backend on Intel GPU.
|
||||
|
||||
You need to install following sofeware before development:
|
||||
- Intel GPU driver
|
||||
- oneAPI package
|
||||
- other development tools.
|
||||
|
||||
Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide.
|
||||
|
||||
- Linux
|
||||
|
||||
```
|
||||
## build from source code
|
||||
./examples/sycl/build.sh
|
||||
|
||||
## run CONV_2D_DW unit test cases
|
||||
./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
|
||||
|
||||
## run all unit test cases
|
||||
./build/bin/test-backend-ops -b SYCL0
|
||||
|
||||
## run with LLM on the first GPU
|
||||
./examples/sycl/test.sh -mg 0 -m xxxx.gguf
|
||||
|
||||
## run service with LLM on the first GPU
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
./examples/sycl/start-svr.sh -m xxxx.gguf
|
||||
|
||||
## update the docs/ops.md for new/update OPs
|
||||
./examples/sycl/update-ops-doc.sh
|
||||
```
|
||||
|
||||
- Windows
|
||||
|
||||
```
|
||||
## build from source code
|
||||
examples\sycl\win-build-sycl.bat
|
||||
|
||||
## run CONV_2D_DW unit test cases
|
||||
build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
|
||||
|
||||
## run all unit test cases
|
||||
build\bin\test-backend-ops.exe -b SYCL0
|
||||
|
||||
## run LLM on the first GPU
|
||||
examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
|
||||
|
||||
## run service with LLM on the first GPU
|
||||
set ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
examples\sycl\win-start-svr.bat -m xxxx.gguf
|
||||
|
||||
## update the docs/ops.md for new/update OPs
|
||||
examples\sycl\win-update-ops-doc.bat
|
||||
```
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### I. Setup Environment
|
||||
|
||||
@ -27,11 +27,11 @@ Legend:
|
||||
| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
|
||||
3164
docs/ops/SYCL.csv
3164
docs/ops/SYCL.csv
File diff suppressed because it is too large
Load Diff
9
examples/sycl/update-ops-doc.sh
Executable file
9
examples/sycl/update-ops-doc.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2026 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
|
||||
./scripts/create_ops_docs.py
|
||||
|
||||
8
examples/sycl/win-update-ops-doc.bat
Normal file
8
examples/sycl/win-update-ops-doc.bat
Normal file
@ -0,0 +1,8 @@
|
||||
@echo off
|
||||
|
||||
rem MIT license
|
||||
rem Copyright (C) 2026 Intel Corporation
|
||||
rem SPDX-License-Identifier: MIT
|
||||
|
||||
build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
|
||||
python scripts\create_ops_docs.py
|
||||
158
ggml/src/ggml-sycl/conv2d-dw.cpp
Normal file
158
ggml/src/ggml-sycl/conv2d-dw.cpp
Normal file
@ -0,0 +1,158 @@
|
||||
#include "conv2d-dw.hpp"
|
||||
|
||||
struct conv2d_dw_params {
|
||||
int in_w, in_h;
|
||||
int out_w, out_h;
|
||||
int kernel_w, kernel_h;
|
||||
int stride_x, stride_y;
|
||||
int padding_x, padding_y;
|
||||
int dilation_x, dilation_y;
|
||||
int channels, batches;
|
||||
};
|
||||
|
||||
struct conv2d_dw_kernel_bounds {
|
||||
int y_min, y_max;
|
||||
int x_min, x_max;
|
||||
};
|
||||
|
||||
static inline conv2d_dw_kernel_bounds dw_calculate_kernel_bounds(int out_x, int out_y,
|
||||
const conv2d_dw_params & p) {
|
||||
conv2d_dw_kernel_bounds bounds;
|
||||
bounds.y_min = sycl::max(0, (p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
|
||||
bounds.y_max = sycl::min(p.kernel_h,
|
||||
(p.in_h + p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
|
||||
bounds.x_min = sycl::max(0, (p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
|
||||
bounds.x_max = sycl::min(p.kernel_w,
|
||||
(p.in_w + p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
static inline int dw_calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
|
||||
return out_coord * stride + kern_coord * dilation - padding;
|
||||
}
|
||||
|
||||
// whcn layout: input/output stored as [N, C, H, W]
|
||||
struct dw_whcn_layout {
|
||||
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.in_w * p.in_h) + c * p.in_w * p.in_h + y * p.in_w + x;
|
||||
}
|
||||
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
|
||||
return c * p.kernel_h * p.kernel_w + ky * p.kernel_w + kx;
|
||||
}
|
||||
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.out_w * p.out_h) + c * p.out_w * p.out_h + y * p.out_w + x;
|
||||
}
|
||||
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
|
||||
int & n, int & c, int & out_y, int & out_x) {
|
||||
out_x = global_idx % p.out_w;
|
||||
out_y = (global_idx / p.out_w) % p.out_h;
|
||||
c = (global_idx / (p.out_w * p.out_h)) % p.channels;
|
||||
n = global_idx / (p.out_w * p.out_h * p.channels);
|
||||
}
|
||||
};
|
||||
|
||||
// cwhn layout: input/output stored as [N, H, W, C]
|
||||
struct dw_cwhn_layout {
|
||||
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.in_w * p.in_h) + (y * p.in_w + x) * p.channels + c;
|
||||
}
|
||||
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
|
||||
return (ky * p.kernel_w + kx) * p.channels + c;
|
||||
}
|
||||
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
|
||||
return n * (p.channels * p.out_w * p.out_h) + y * (p.out_w * p.channels) + x * p.channels + c;
|
||||
}
|
||||
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
|
||||
int & n, int & c, int & out_y, int & out_x) {
|
||||
c = global_idx % p.channels;
|
||||
out_x = (global_idx / p.channels) % p.out_w;
|
||||
out_y = (global_idx / (p.channels * p.out_w)) % p.out_h;
|
||||
n = global_idx / (p.channels * p.out_w * p.out_h);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Layout>
|
||||
static void conv2d_dw_kernel(const float * input, const float * kernel, float * output,
|
||||
const conv2d_dw_params p, const sycl::nd_item<3> & item_ct1) {
|
||||
const int global_idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int total_elements = p.batches * p.channels * p.out_h * p.out_w;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
int n, c, out_y, out_x;
|
||||
Layout::unpack_indices(global_idx, p, n, c, out_y, out_x);
|
||||
|
||||
float acc = 0.0f;
|
||||
const conv2d_dw_kernel_bounds bounds = dw_calculate_kernel_bounds(out_x, out_y, p);
|
||||
|
||||
for (int ky = bounds.y_min; ky < bounds.y_max; ++ky) {
|
||||
const int in_y = dw_calculate_input_coord(out_y, ky, p.stride_y, p.dilation_y, p.padding_y);
|
||||
for (int kx = bounds.x_min; kx < bounds.x_max; ++kx) {
|
||||
const int in_x = dw_calculate_input_coord(out_x, kx, p.stride_x, p.dilation_x, p.padding_x);
|
||||
acc += input[Layout::input_index(n, c, in_y, in_x, p)] *
|
||||
kernel[Layout::kernel_index(c, ky, kx, p)];
|
||||
}
|
||||
}
|
||||
|
||||
output[Layout::output_index(n, c, out_y, out_x, p)] = acc;
|
||||
}
|
||||
|
||||
template <typename Layout>
|
||||
static void conv2d_dw_sycl(const float * x_d, const float * w_d, float * y_d,
|
||||
const conv2d_dw_params p, const queue_ptr & stream) {
|
||||
const int total = p.batches * p.channels * p.out_h * p.out_w;
|
||||
const int num_blocks = (total + SYCL_CONV2D_DW_BLOCK_SIZE - 1) / SYCL_CONV2D_DW_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_DW_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv2d_dw_kernel<Layout>(x_d, w_d, y_d, p, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float * w_d = (const float *) kernel->data;
|
||||
const float * x_d = (const float *) input->data;
|
||||
float * y_d = (float *) dst->data;
|
||||
|
||||
const int32_t * p = (const int32_t *) dst->op_params;
|
||||
const int stride_x = p[0];
|
||||
const int stride_y = p[1];
|
||||
const int padding_x = p[2];
|
||||
const int padding_y = p[3];
|
||||
const int dilation_x = p[4];
|
||||
const int dilation_y = p[5];
|
||||
|
||||
const int in_w = input->ne[0];
|
||||
const int in_h = input->ne[1];
|
||||
const int kernel_w = kernel->ne[0];
|
||||
const int kernel_h = kernel->ne[1];
|
||||
const int out_w = dst->ne[0];
|
||||
const int out_h = dst->ne[1];
|
||||
const int channels = dst->ne[2];
|
||||
const int batches = dst->ne[3];
|
||||
|
||||
const conv2d_dw_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h,
|
||||
stride_x, stride_y, padding_x, padding_y,
|
||||
dilation_x, dilation_y, channels, batches };
|
||||
|
||||
const queue_ptr stream = ctx.stream();
|
||||
|
||||
if (ggml_is_contiguous(input)) {
|
||||
conv2d_dw_sycl<dw_whcn_layout>(x_d, w_d, y_d, params, stream);
|
||||
} else if (ggml_is_contiguous_channels(input)) {
|
||||
conv2d_dw_sycl<dw_cwhn_layout>(x_d, w_d, y_d, params, stream);
|
||||
} else {
|
||||
GGML_ABORT("Unsupported memory layout for conv2d_dw");
|
||||
}
|
||||
}
|
||||
10
ggml/src/ggml-sycl/conv2d-dw.hpp
Normal file
10
ggml/src/ggml-sycl/conv2d-dw.hpp
Normal file
@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_CONV2D_DW_HPP
|
||||
#define GGML_SYCL_CONV2D_DW_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_CONV2D_DW_BLOCK_SIZE 256
|
||||
|
||||
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV2D_DW_HPP
|
||||
125
ggml/src/ggml-sycl/conv2d-transpose.cpp
Normal file
125
ggml/src/ggml-sycl/conv2d-transpose.cpp
Normal file
@ -0,0 +1,125 @@
|
||||
#include "conv2d-transpose.hpp"
|
||||
#include "convert.hpp"
|
||||
|
||||
template <typename kernel_t>
|
||||
static void conv2d_transpose_kernel(const float * input, const kernel_t * kernel, float * output,
|
||||
const int in_w, const int in_h,
|
||||
const int out_w, const int out_h,
|
||||
const int kernel_w, const int kernel_h,
|
||||
const int stride,
|
||||
const int c_in, const int c_out, const int batches,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int global_idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int total_elements = out_w * out_h * c_out * batches;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int out_x = global_idx % out_w;
|
||||
const int out_y = (global_idx / out_w) % out_h;
|
||||
const int c_idx = (global_idx / (out_w * out_h)) % c_out;
|
||||
const int n_idx = global_idx / (out_w * out_h * c_out);
|
||||
|
||||
float acc = 0.0f;
|
||||
|
||||
for (int c_in_idx = 0; c_in_idx < c_in; ++c_in_idx) {
|
||||
for (int kh = 0; kh < kernel_h; ++kh) {
|
||||
int in_y = out_y - kh;
|
||||
if (in_y < 0 || in_y % stride) {
|
||||
continue;
|
||||
}
|
||||
in_y /= stride;
|
||||
if (in_y >= in_h) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int kw = 0; kw < kernel_w; ++kw) {
|
||||
int in_x = out_x - kw;
|
||||
if (in_x < 0 || in_x % stride) {
|
||||
continue;
|
||||
}
|
||||
in_x /= stride;
|
||||
if (in_x >= in_w) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + in_w * in_y + in_x;
|
||||
const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx +
|
||||
kernel_w * kh + kw;
|
||||
|
||||
acc += input[input_idx] * ggml_sycl_cast<float>(kernel[kernel_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + out_w * out_y + out_x] = acc;
|
||||
}
|
||||
|
||||
template <typename kernel_t>
|
||||
static void conv2d_transpose_sycl(const float * input_d, const kernel_t * kernel_d, float * output_d,
|
||||
const int in_w, const int in_h,
|
||||
const int out_w, const int out_h,
|
||||
const int kernel_w, const int kernel_h,
|
||||
const int stride,
|
||||
const int c_in, const int c_out, const int batches,
|
||||
const queue_ptr & stream) {
|
||||
const int total = out_w * out_h * c_out * batches;
|
||||
const int num_blocks = (total + SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv2d_transpose_kernel<kernel_t>(input_d, kernel_d, output_d,
|
||||
in_w, in_h, out_w, out_h, kernel_w, kernel_h,
|
||||
stride, c_in, c_out, batches, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
// input: (W, H, C_in, N)
|
||||
// kernel: (W, H, C_out, C_in)
|
||||
// output: (W, H, C_out, N)
|
||||
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(input));
|
||||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const float * input_d = (const float *) input->data;
|
||||
float * output_d = (float *) dst->data;
|
||||
const void * kernel_d = kernel->data;
|
||||
|
||||
const int input_w = input->ne[0];
|
||||
const int input_h = input->ne[1];
|
||||
const int channels_in = input->ne[2];
|
||||
const int batches = input->ne[3];
|
||||
const int output_w = dst->ne[0];
|
||||
const int output_h = dst->ne[1];
|
||||
const int channels_out = kernel->ne[2];
|
||||
const int kernel_w = kernel->ne[0];
|
||||
const int kernel_h = kernel->ne[1];
|
||||
const int stride = dst->op_params[0];
|
||||
|
||||
GGML_ASSERT(channels_in == kernel->ne[3]);
|
||||
GGML_ASSERT(stride > 0);
|
||||
|
||||
const queue_ptr stream = ctx.stream();
|
||||
|
||||
if (kernel->type == GGML_TYPE_F16) {
|
||||
conv2d_transpose_sycl<sycl::half>(input_d, (const sycl::half *) kernel_d, output_d,
|
||||
input_w, input_h, output_w, output_h, kernel_w, kernel_h,
|
||||
stride, channels_in, channels_out, batches, stream);
|
||||
} else {
|
||||
conv2d_transpose_sycl<float>(input_d, (const float *) kernel_d, output_d,
|
||||
input_w, input_h, output_w, output_h, kernel_w, kernel_h,
|
||||
stride, channels_in, channels_out, batches, stream);
|
||||
}
|
||||
}
|
||||
10
ggml/src/ggml-sycl/conv2d-transpose.hpp
Normal file
10
ggml/src/ggml-sycl/conv2d-transpose.hpp
Normal file
@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_CONV2D_TRANSPOSE_HPP
|
||||
#define GGML_SYCL_CONV2D_TRANSPOSE_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE 256
|
||||
|
||||
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV2D_TRANSPOSE_HPP
|
||||
150
ggml/src/ggml-sycl/conv2d.cpp
Normal file
150
ggml/src/ggml-sycl/conv2d.cpp
Normal file
@ -0,0 +1,150 @@
|
||||
#include "conv2d.hpp"
|
||||
#include "convert.hpp"
|
||||
|
||||
struct conv2d_params {
|
||||
const int64_t IW, IH;
|
||||
const int64_t OW, OH;
|
||||
const int64_t KW, KH;
|
||||
const int64_t ST_X, ST_Y;
|
||||
const int64_t PD_X, PD_Y;
|
||||
const int64_t DL_X, DL_Y;
|
||||
const int64_t IC, OC;
|
||||
const int64_t B;
|
||||
const int64_t TOTAL;
|
||||
};
|
||||
|
||||
struct conv2d_kernel_bounds {
|
||||
int64_t y_min, y_max;
|
||||
int64_t x_min, x_max;
|
||||
};
|
||||
|
||||
static inline int64_t conv2d_max64(int64_t a, int64_t b) {
|
||||
return (a > b) ? a : b;
|
||||
}
|
||||
|
||||
static inline int64_t conv2d_min64(int64_t a, int64_t b) {
|
||||
return (a < b) ? a : b;
|
||||
}
|
||||
|
||||
static inline conv2d_kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv2d_params & P) {
|
||||
conv2d_kernel_bounds bounds;
|
||||
bounds.y_min = conv2d_max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
|
||||
bounds.y_max = conv2d_min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
|
||||
bounds.x_min = conv2d_max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
|
||||
bounds.x_max = conv2d_min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
static inline int calculate_input_coord(int64_t out_coord, int64_t kern_coord, int64_t stride,
|
||||
int64_t dilation, int64_t padding) {
|
||||
return out_coord * stride + kern_coord * dilation - padding;
|
||||
}
|
||||
|
||||
// whcn layout helpers (matching ggml tensor memory order)
|
||||
static inline int64_t whcn_input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
|
||||
return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x;
|
||||
}
|
||||
|
||||
static inline int64_t whcn_kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv2d_params & P) {
|
||||
return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx;
|
||||
}
|
||||
|
||||
static inline int64_t whcn_output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
|
||||
return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void conv2d_kernel(const float * input, const T * kernel, float * output,
|
||||
const conv2d_params P, const sycl::nd_item<3> & item_ct1) {
|
||||
const int64_t global_idx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
|
||||
if (global_idx >= P.TOTAL) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t out_x = global_idx % P.OW;
|
||||
const int64_t out_y = (global_idx / P.OW) % P.OH;
|
||||
const int64_t c_out = (global_idx / (P.OW * P.OH)) % P.OC;
|
||||
const int64_t n = global_idx / (P.OW * P.OH * P.OC);
|
||||
|
||||
float acc = 0.0f;
|
||||
|
||||
const conv2d_kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
|
||||
|
||||
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
|
||||
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
|
||||
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
|
||||
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
|
||||
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
|
||||
const float input_val = input[whcn_input_index(n, c_in, in_y, in_x, P)];
|
||||
const T kernel_val = kernel[whcn_kernel_index(c_out, c_in, ky, kx, P)];
|
||||
acc += input_val * ggml_sycl_cast<float>(kernel_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[whcn_output_index(n, c_out, out_y, out_x, P)] = acc;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void conv2d_sycl(const float * X_D, const T * K_D, float * Y_D,
|
||||
const conv2d_params P, const queue_ptr & stream) {
|
||||
const int num_blocks = (P.TOTAL + SYCL_CONV2D_BLOCK_SIZE - 1) / SYCL_CONV2D_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv2d_kernel<T>(X_D, K_D, Y_D, P, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
const float * K_D = (const float *) kernel->data;
|
||||
const float * X_D = (const float *) input->data;
|
||||
float * Y_D = (float *) dst->data;
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(input->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
// same number of input channels
|
||||
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
|
||||
|
||||
const queue_ptr stream = ctx.stream();
|
||||
|
||||
const int32_t * p = (const int32_t *) dst->op_params;
|
||||
const int ST_X = p[0];
|
||||
const int ST_Y = p[1];
|
||||
const int PD_X = p[2];
|
||||
const int PD_Y = p[3];
|
||||
const int DL_X = p[4];
|
||||
const int DL_Y = p[5];
|
||||
|
||||
// no cwhn layout support
|
||||
GGML_ASSERT(p[6] == 0);
|
||||
|
||||
const int IW = input->ne[0];
|
||||
const int IH = input->ne[1];
|
||||
const int OW = dst->ne[0];
|
||||
const int OH = dst->ne[1];
|
||||
const int KW = kernel->ne[0];
|
||||
const int KH = kernel->ne[1];
|
||||
const int IC = input->ne[2];
|
||||
const int OC = kernel->ne[3];
|
||||
const int B = input->ne[3];
|
||||
|
||||
const int64_t total = (int64_t) B * OC * OH * OW;
|
||||
const conv2d_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };
|
||||
|
||||
if (kernel->type == GGML_TYPE_F16) {
|
||||
conv2d_sycl<sycl::half>(X_D, (const sycl::half *) K_D, Y_D, params, stream);
|
||||
} else {
|
||||
conv2d_sycl<float>(X_D, K_D, Y_D, params, stream);
|
||||
}
|
||||
}
|
||||
10
ggml/src/ggml-sycl/conv2d.hpp
Normal file
10
ggml/src/ggml-sycl/conv2d.hpp
Normal file
@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_CONV2D_HPP
|
||||
#define GGML_SYCL_CONV2D_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_CONV2D_BLOCK_SIZE 256
|
||||
|
||||
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV2D_HPP
|
||||
@ -62,6 +62,9 @@
|
||||
#include "ggml-sycl/repeat_back.hpp"
|
||||
#include "ggml-sycl/set_rows.hpp"
|
||||
#include "ggml-sycl/set.hpp"
|
||||
#include "ggml-sycl/conv2d.hpp"
|
||||
#include "ggml-sycl/conv2d-dw.hpp"
|
||||
#include "ggml-sycl/conv2d-transpose.hpp"
|
||||
#include "ggml-sycl/ssm_conv.hpp"
|
||||
#include "ggml-sycl/sycl_hw.hpp"
|
||||
#include "ggml-sycl/ssm_scan.hpp"
|
||||
@ -4664,12 +4667,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_OP_ARGMAX:
|
||||
ggml_sycl_argmax(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_sycl_op_conv_transpose_1d(ctx, dst);
|
||||
case GGML_OP_CONV_2D:
|
||||
ggml_sycl_op_conv2d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
ggml_sycl_op_conv2d_dw(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_3D:
|
||||
ggml_sycl_conv_3d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_sycl_op_conv_transpose_1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
ggml_sycl_op_conv2d_transpose(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_REPEAT:
|
||||
ggml_sycl_repeat(ctx, dst);
|
||||
break;
|
||||
@ -5387,6 +5399,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
}
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
return true;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_SGN:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user