From 72be44f1d25a3d7cc9c2f7f868337d5dd1fad4a7 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Mon, 15 Jun 2026 15:08:34 +0800 Subject: [PATCH] sycl : fix reorder function; add fp32/fp16 in build script (#24578) --- examples/sycl/build.sh | 40 ++++++++-- examples/sycl/win-build-sycl.bat | 31 ++++++-- ggml/src/ggml-sycl/mmvq.cpp | 130 +++++++++++++++---------------- 3 files changed, 123 insertions(+), 78 deletions(-) diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index bf7d6b53bf..9dd66cb676 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -3,15 +3,45 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT +print_usage() { + echo "Usage: ./build.sh [fp32|fp16] [--help]" + echo "" + echo "Options:" + echo " fp32 Build with FP32 precision (default)" + echo " fp16 Build with FP16 precision (faster for long-prompt inference)" + echo " --help Print this help message" +} + +PRECISION=fp32 + +for arg in "$@"; do + case "$arg" in + --help) + print_usage + exit 0 + ;; + fp32|fp16) + PRECISION="$arg" + ;; + *) + echo "Error: unknown option '$arg'" + print_usage + exit 1 + ;; + esac +done + mkdir -p build cd build source /opt/intel/oneapi/setvars.sh -#for FP16 -#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference - -#for FP32 -cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF +if [ "$PRECISION" = "fp16" ]; then + #for FP16 + cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference +else + #for FP32 + cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF +fi #build example/main #cmake --build . --config Release --target main diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index fc8b33bbc2..9a82edbefe 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -3,6 +3,23 @@ :: Copyright (C) 2024 Intel Corporation :: SPDX-License-Identifier: MIT +IF /I "%1"=="--help" ( + echo Usage: win-build-sycl.bat [fp32^|fp16] [--help] + echo. + echo Options: + echo fp32 Build with FP32 precision ^(default^) + echo fp16 Build with FP16 precision ^(faster for long-prompt inference^) + echo --help Print this help message + exit /B 0 +) + +SET PRECISION=%1 +IF "%PRECISION%"=="" SET PRECISION=fp32 +IF /I NOT "%PRECISION%"=="fp32" IF /I NOT "%PRECISION%"=="fp16" ( + echo Error: invalid value '%PRECISION%'. Use 'fp32' or 'fp16'. + echo Usage: win-build-sycl.bat [fp32^|fp16] [--help] + exit /B 1 +) IF not exist build (mkdir build) cd build @@ -11,12 +28,14 @@ if %errorlevel% neq 0 goto ERROR @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force if %errorlevel% neq 0 goto ERROR -:: for FP16 -:: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON - -:: for FP32 -cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +IF /I "%PRECISION%"=="fp16" ( + :: for FP16 + :: faster for long-prompt inference + cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON +) ELSE ( + :: for FP32 + cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +) if %errorlevel% neq 0 goto ERROR :: build all binary diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index cf2b59576a..3a3daf4f1d 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -662,13 +662,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, GGML_ASSERT(ncols % QK4_0 == 0); // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -683,13 +682,13 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1080,13 +1079,12 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, GGML_ASSERT(ncols % QK8_0 == 0); // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1101,13 +1099,13 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1289,13 +1287,12 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1310,13 +1307,13 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1457,13 +1454,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1478,13 +1474,14 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1583,15 +1580,13 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1606,13 +1601,14 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1643,13 +1639,13 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, GGML_ASSERT(ncols % QK_K == 0); // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1664,13 +1660,13 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item);