[SYCL] support OPs: conv_2d, conv_2d_dw, conv2d_transpose (#24600)

* fix conflict

* fix format issue, rename

* rm debug code

* correct the file name
This commit is contained in:
Neo Zhang 2026-06-18 14:40:03 +08:00 committed by GitHub
parent 0b73fc79fe
commit 6f1034b32a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 2141 additions and 1587 deletions

View File

@ -161,6 +161,64 @@ You could update your test result in it directly.
Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.
## Quick Development WOW
This chapter is for quick development & try with SYCL backend on Intel GPU.
You need to install following sofeware before development:
- Intel GPU driver
- oneAPI package
- other development tools.
Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide.
- Linux
```
## build from source code
./examples/sycl/build.sh
## run CONV_2D_DW unit test cases
./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
## run all unit test cases
./build/bin/test-backend-ops -b SYCL0
## run with LLM on the first GPU
./examples/sycl/test.sh -mg 0 -m xxxx.gguf
## run service with LLM on the first GPU
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
./examples/sycl/start-svr.sh -m xxxx.gguf
## update the docs/ops.md for new/update OPs
./examples/sycl/update-ops-doc.sh
```
- Windows
```
## build from source code
examples\sycl\win-build-sycl.bat
## run CONV_2D_DW unit test cases
build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
## run all unit test cases
build\bin\test-backend-ops.exe -b SYCL0
## run LLM on the first GPU
examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
## run service with LLM on the first GPU
set ONEAPI_DEVICE_SELECTOR="level_zero:0"
examples\sycl\win-start-svr.bat -m xxxx.gguf
## update the docs/ops.md for new/update OPs
examples\sycl\win-update-ops-doc.bat
```
## Linux
### I. Setup Environment

View File

@ -27,11 +27,11 @@ Legend:
| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,9 @@
#!/bin/bash
# MIT license
# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: MIT
./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
./scripts/create_ops_docs.py

View File

@ -0,0 +1,8 @@
@echo off
rem MIT license
rem Copyright (C) 2026 Intel Corporation
rem SPDX-License-Identifier: MIT
build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
python scripts\create_ops_docs.py

View File

@ -0,0 +1,158 @@
#include "conv2d-dw.hpp"
struct conv2d_dw_params {
int in_w, in_h;
int out_w, out_h;
int kernel_w, kernel_h;
int stride_x, stride_y;
int padding_x, padding_y;
int dilation_x, dilation_y;
int channels, batches;
};
struct conv2d_dw_kernel_bounds {
int y_min, y_max;
int x_min, x_max;
};
static inline conv2d_dw_kernel_bounds dw_calculate_kernel_bounds(int out_x, int out_y,
const conv2d_dw_params & p) {
conv2d_dw_kernel_bounds bounds;
bounds.y_min = sycl::max(0, (p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
bounds.y_max = sycl::min(p.kernel_h,
(p.in_h + p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y);
bounds.x_min = sycl::max(0, (p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
bounds.x_max = sycl::min(p.kernel_w,
(p.in_w + p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x);
return bounds;
}
static inline int dw_calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
return out_coord * stride + kern_coord * dilation - padding;
}
// whcn layout: input/output stored as [N, C, H, W]
struct dw_whcn_layout {
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.in_w * p.in_h) + c * p.in_w * p.in_h + y * p.in_w + x;
}
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
return c * p.kernel_h * p.kernel_w + ky * p.kernel_w + kx;
}
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.out_w * p.out_h) + c * p.out_w * p.out_h + y * p.out_w + x;
}
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
int & n, int & c, int & out_y, int & out_x) {
out_x = global_idx % p.out_w;
out_y = (global_idx / p.out_w) % p.out_h;
c = (global_idx / (p.out_w * p.out_h)) % p.channels;
n = global_idx / (p.out_w * p.out_h * p.channels);
}
};
// cwhn layout: input/output stored as [N, H, W, C]
struct dw_cwhn_layout {
static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.in_w * p.in_h) + (y * p.in_w + x) * p.channels + c;
}
static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) {
return (ky * p.kernel_w + kx) * p.channels + c;
}
static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) {
return n * (p.channels * p.out_w * p.out_h) + y * (p.out_w * p.channels) + x * p.channels + c;
}
static void unpack_indices(int global_idx, const conv2d_dw_params & p,
int & n, int & c, int & out_y, int & out_x) {
c = global_idx % p.channels;
out_x = (global_idx / p.channels) % p.out_w;
out_y = (global_idx / (p.channels * p.out_w)) % p.out_h;
n = global_idx / (p.channels * p.out_w * p.out_h);
}
};
template <typename Layout>
static void conv2d_dw_kernel(const float * input, const float * kernel, float * output,
const conv2d_dw_params p, const sycl::nd_item<3> & item_ct1) {
const int global_idx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
const int total_elements = p.batches * p.channels * p.out_h * p.out_w;
if (global_idx >= total_elements) {
return;
}
int n, c, out_y, out_x;
Layout::unpack_indices(global_idx, p, n, c, out_y, out_x);
float acc = 0.0f;
const conv2d_dw_kernel_bounds bounds = dw_calculate_kernel_bounds(out_x, out_y, p);
for (int ky = bounds.y_min; ky < bounds.y_max; ++ky) {
const int in_y = dw_calculate_input_coord(out_y, ky, p.stride_y, p.dilation_y, p.padding_y);
for (int kx = bounds.x_min; kx < bounds.x_max; ++kx) {
const int in_x = dw_calculate_input_coord(out_x, kx, p.stride_x, p.dilation_x, p.padding_x);
acc += input[Layout::input_index(n, c, in_y, in_x, p)] *
kernel[Layout::kernel_index(c, ky, kx, p)];
}
}
output[Layout::output_index(n, c, out_y, out_x, p)] = acc;
}
template <typename Layout>
static void conv2d_dw_sycl(const float * x_d, const float * w_d, float * y_d,
const conv2d_dw_params p, const queue_ptr & stream) {
const int total = p.batches * p.channels * p.out_h * p.out_w;
const int num_blocks = (total + SYCL_CONV2D_DW_BLOCK_SIZE - 1) / SYCL_CONV2D_DW_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_DW_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv2d_dw_kernel<Layout>(x_d, w_d, y_d, p, item_ct1);
});
}
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * input = dst->src[1];
GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
const float * w_d = (const float *) kernel->data;
const float * x_d = (const float *) input->data;
float * y_d = (float *) dst->data;
const int32_t * p = (const int32_t *) dst->op_params;
const int stride_x = p[0];
const int stride_y = p[1];
const int padding_x = p[2];
const int padding_y = p[3];
const int dilation_x = p[4];
const int dilation_y = p[5];
const int in_w = input->ne[0];
const int in_h = input->ne[1];
const int kernel_w = kernel->ne[0];
const int kernel_h = kernel->ne[1];
const int out_w = dst->ne[0];
const int out_h = dst->ne[1];
const int channels = dst->ne[2];
const int batches = dst->ne[3];
const conv2d_dw_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h,
stride_x, stride_y, padding_x, padding_y,
dilation_x, dilation_y, channels, batches };
const queue_ptr stream = ctx.stream();
if (ggml_is_contiguous(input)) {
conv2d_dw_sycl<dw_whcn_layout>(x_d, w_d, y_d, params, stream);
} else if (ggml_is_contiguous_channels(input)) {
conv2d_dw_sycl<dw_cwhn_layout>(x_d, w_d, y_d, params, stream);
} else {
GGML_ABORT("Unsupported memory layout for conv2d_dw");
}
}

View File

@ -0,0 +1,10 @@
#ifndef GGML_SYCL_CONV2D_DW_HPP
#define GGML_SYCL_CONV2D_DW_HPP
#include "common.hpp"
#define SYCL_CONV2D_DW_BLOCK_SIZE 256
void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV2D_DW_HPP

View File

@ -0,0 +1,125 @@
#include "conv2d-transpose.hpp"
#include "convert.hpp"
template <typename kernel_t>
static void conv2d_transpose_kernel(const float * input, const kernel_t * kernel, float * output,
const int in_w, const int in_h,
const int out_w, const int out_h,
const int kernel_w, const int kernel_h,
const int stride,
const int c_in, const int c_out, const int batches,
const sycl::nd_item<3> & item_ct1) {
const int global_idx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
const int total_elements = out_w * out_h * c_out * batches;
if (global_idx >= total_elements) {
return;
}
const int out_x = global_idx % out_w;
const int out_y = (global_idx / out_w) % out_h;
const int c_idx = (global_idx / (out_w * out_h)) % c_out;
const int n_idx = global_idx / (out_w * out_h * c_out);
float acc = 0.0f;
for (int c_in_idx = 0; c_in_idx < c_in; ++c_in_idx) {
for (int kh = 0; kh < kernel_h; ++kh) {
int in_y = out_y - kh;
if (in_y < 0 || in_y % stride) {
continue;
}
in_y /= stride;
if (in_y >= in_h) {
continue;
}
for (int kw = 0; kw < kernel_w; ++kw) {
int in_x = out_x - kw;
if (in_x < 0 || in_x % stride) {
continue;
}
in_x /= stride;
if (in_x >= in_w) {
continue;
}
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + in_w * in_y + in_x;
const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx +
kernel_w * kh + kw;
acc += input[input_idx] * ggml_sycl_cast<float>(kernel[kernel_idx]);
}
}
}
output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + out_w * out_y + out_x] = acc;
}
template <typename kernel_t>
static void conv2d_transpose_sycl(const float * input_d, const kernel_t * kernel_d, float * output_d,
const int in_w, const int in_h,
const int out_w, const int out_h,
const int kernel_w, const int kernel_h,
const int stride,
const int c_in, const int c_out, const int batches,
const queue_ptr & stream) {
const int total = out_w * out_h * c_out * batches;
const int num_blocks = (total + SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv2d_transpose_kernel<kernel_t>(input_d, kernel_d, output_d,
in_w, in_h, out_w, out_h, kernel_w, kernel_h,
stride, c_in, c_out, batches, item_ct1);
});
}
// input: (W, H, C_in, N)
// kernel: (W, H, C_out, C_in)
// output: (W, H, C_out, N)
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * input = dst->src[1];
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(input));
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(ggml_is_contiguous(dst));
const float * input_d = (const float *) input->data;
float * output_d = (float *) dst->data;
const void * kernel_d = kernel->data;
const int input_w = input->ne[0];
const int input_h = input->ne[1];
const int channels_in = input->ne[2];
const int batches = input->ne[3];
const int output_w = dst->ne[0];
const int output_h = dst->ne[1];
const int channels_out = kernel->ne[2];
const int kernel_w = kernel->ne[0];
const int kernel_h = kernel->ne[1];
const int stride = dst->op_params[0];
GGML_ASSERT(channels_in == kernel->ne[3]);
GGML_ASSERT(stride > 0);
const queue_ptr stream = ctx.stream();
if (kernel->type == GGML_TYPE_F16) {
conv2d_transpose_sycl<sycl::half>(input_d, (const sycl::half *) kernel_d, output_d,
input_w, input_h, output_w, output_h, kernel_w, kernel_h,
stride, channels_in, channels_out, batches, stream);
} else {
conv2d_transpose_sycl<float>(input_d, (const float *) kernel_d, output_d,
input_w, input_h, output_w, output_h, kernel_w, kernel_h,
stride, channels_in, channels_out, batches, stream);
}
}

View File

@ -0,0 +1,10 @@
#ifndef GGML_SYCL_CONV2D_TRANSPOSE_HPP
#define GGML_SYCL_CONV2D_TRANSPOSE_HPP
#include "common.hpp"
#define SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE 256
void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV2D_TRANSPOSE_HPP

View File

@ -0,0 +1,150 @@
#include "conv2d.hpp"
#include "convert.hpp"
struct conv2d_params {
const int64_t IW, IH;
const int64_t OW, OH;
const int64_t KW, KH;
const int64_t ST_X, ST_Y;
const int64_t PD_X, PD_Y;
const int64_t DL_X, DL_Y;
const int64_t IC, OC;
const int64_t B;
const int64_t TOTAL;
};
struct conv2d_kernel_bounds {
int64_t y_min, y_max;
int64_t x_min, x_max;
};
static inline int64_t conv2d_max64(int64_t a, int64_t b) {
return (a > b) ? a : b;
}
static inline int64_t conv2d_min64(int64_t a, int64_t b) {
return (a < b) ? a : b;
}
static inline conv2d_kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv2d_params & P) {
conv2d_kernel_bounds bounds;
bounds.y_min = conv2d_max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
bounds.y_max = conv2d_min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y);
bounds.x_min = conv2d_max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
bounds.x_max = conv2d_min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X);
return bounds;
}
static inline int calculate_input_coord(int64_t out_coord, int64_t kern_coord, int64_t stride,
int64_t dilation, int64_t padding) {
return out_coord * stride + kern_coord * dilation - padding;
}
// whcn layout helpers (matching ggml tensor memory order)
static inline int64_t whcn_input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x;
}
static inline int64_t whcn_kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv2d_params & P) {
return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx;
}
static inline int64_t whcn_output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) {
return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x;
}
template <typename T>
static void conv2d_kernel(const float * input, const T * kernel, float * output,
const conv2d_params P, const sycl::nd_item<3> & item_ct1) {
const int64_t global_idx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (global_idx >= P.TOTAL) {
return;
}
const int64_t out_x = global_idx % P.OW;
const int64_t out_y = (global_idx / P.OW) % P.OH;
const int64_t c_out = (global_idx / (P.OW * P.OH)) % P.OC;
const int64_t n = global_idx / (P.OW * P.OH * P.OC);
float acc = 0.0f;
const conv2d_kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
const float input_val = input[whcn_input_index(n, c_in, in_y, in_x, P)];
const T kernel_val = kernel[whcn_kernel_index(c_out, c_in, ky, kx, P)];
acc += input_val * ggml_sycl_cast<float>(kernel_val);
}
}
}
output[whcn_output_index(n, c_out, out_y, out_x, P)] = acc;
}
template <typename T>
static void conv2d_sycl(const float * X_D, const T * K_D, float * Y_D,
const conv2d_params P, const queue_ptr & stream) {
const int num_blocks = (P.TOTAL + SYCL_CONV2D_BLOCK_SIZE - 1) / SYCL_CONV2D_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv2d_kernel<T>(X_D, K_D, Y_D, P, item_ct1);
});
}
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * input = dst->src[1];
const float * K_D = (const float *) kernel->data;
const float * X_D = (const float *) input->data;
float * Y_D = (float *) dst->data;
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
GGML_ASSERT(input->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
// same number of input channels
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
const queue_ptr stream = ctx.stream();
const int32_t * p = (const int32_t *) dst->op_params;
const int ST_X = p[0];
const int ST_Y = p[1];
const int PD_X = p[2];
const int PD_Y = p[3];
const int DL_X = p[4];
const int DL_Y = p[5];
// no cwhn layout support
GGML_ASSERT(p[6] == 0);
const int IW = input->ne[0];
const int IH = input->ne[1];
const int OW = dst->ne[0];
const int OH = dst->ne[1];
const int KW = kernel->ne[0];
const int KH = kernel->ne[1];
const int IC = input->ne[2];
const int OC = kernel->ne[3];
const int B = input->ne[3];
const int64_t total = (int64_t) B * OC * OH * OW;
const conv2d_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };
if (kernel->type == GGML_TYPE_F16) {
conv2d_sycl<sycl::half>(X_D, (const sycl::half *) K_D, Y_D, params, stream);
} else {
conv2d_sycl<float>(X_D, K_D, Y_D, params, stream);
}
}

View File

@ -0,0 +1,10 @@
#ifndef GGML_SYCL_CONV2D_HPP
#define GGML_SYCL_CONV2D_HPP
#include "common.hpp"
#define SYCL_CONV2D_BLOCK_SIZE 256
void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV2D_HPP

View File

@ -62,6 +62,9 @@
#include "ggml-sycl/repeat_back.hpp"
#include "ggml-sycl/set_rows.hpp"
#include "ggml-sycl/set.hpp"
#include "ggml-sycl/conv2d.hpp"
#include "ggml-sycl/conv2d-dw.hpp"
#include "ggml-sycl/conv2d-transpose.hpp"
#include "ggml-sycl/ssm_conv.hpp"
#include "ggml-sycl/sycl_hw.hpp"
#include "ggml-sycl/ssm_scan.hpp"
@ -4664,12 +4667,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_ARGMAX:
ggml_sycl_argmax(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_sycl_op_conv_transpose_1d(ctx, dst);
case GGML_OP_CONV_2D:
ggml_sycl_op_conv2d(ctx, dst);
break;
case GGML_OP_CONV_2D_DW:
ggml_sycl_op_conv2d_dw(ctx, dst);
break;
case GGML_OP_CONV_3D:
ggml_sycl_conv_3d(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_sycl_op_conv_transpose_1d(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_2D:
ggml_sycl_op_conv2d_transpose(ctx, dst);
break;
case GGML_OP_REPEAT:
ggml_sycl_repeat(ctx, dst);
break;
@ -5387,6 +5399,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
}
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_2D:
return true;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_SGN: