From 74a80dd9c052bd2d3e3e8134436ad05cc6396d1a Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Wed, 17 Jun 2026 22:21:34 +0800 Subject: [PATCH] [SYCL] add dev2dev memcpy by SYCL API (#24476) * add dev2dev memcpy by SYCL API * mv GGML_SYCL_DEV2DEV_MEMCPY to runntime table * update the detect method for p2p comm * fix the erro created during fix confilct --------- Co-authored-by: Neo Zhang --- docs/backend/SYCL.md | 2 ++ ggml/src/ggml-sycl/common.hpp | 6 ++++ ggml/src/ggml-sycl/dpct/helper.hpp | 22 +++++++++---- ggml/src/ggml-sycl/ggml-sycl.cpp | 52 ++++++++++++++++++++++-------- 4 files changed, 61 insertions(+), 21 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 97d4b52162..6617aa2a51 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -712,6 +712,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | +| GGML_SYCL_DEV2DEV_MEMCPY | 0 (default) or 1 | Choose the SYCL or L0 API in dev2dev memory copy.
Value:
* 0: SYCL API (default)
* 1: L0 API -- L0 API is found to lead to abnormal crash in some case. This debug flag is used to check the issue.| | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.| | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) | | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. | @@ -731,6 +732,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. | | DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. | + ## Design Rule - Open to all contributors. diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 96586ea464..c87a4636e2 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -62,6 +62,7 @@ extern int g_ggml_sycl_debug; extern int g_ggml_sycl_disable_optimize; extern int g_ggml_sycl_prioritize_dmmv; extern int g_ggml_sycl_enable_flash_attention; +extern int g_ggml_sycl_dev2dev_memcpy; #if defined(__clang__) && __has_builtin(__builtin_expect) @@ -126,6 +127,11 @@ enum ggml_sycl_backend_gpu_mode { SYCL_MUL_GPU_MODE }; +enum ggml_sycl_dev2dev_memcpy_mode { + DEV2DEV_MEMCPY_SYCL = 0, + DEV2DEV_MEMCPY_L0 = 1, +}; + static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 791d3cac52..664b8e9697 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -13,14 +13,14 @@ #ifndef GGML_SYCL_DPCT_HELPER_HPP #define GGML_SYCL_DPCT_HELPER_HPP +#include +#include +#include + #include #include #include -#include - -#include "ggml.h" - #if defined(__linux__) #include #elif defined(_WIN64) @@ -43,6 +43,7 @@ #include #endif + #define DPCT_COMPATIBILITY_TEMP (900) #if defined(_MSC_VER) @@ -59,6 +60,13 @@ #define __dpct_noinline__ __attribute__((noinline)) #endif +#define DPCT_UNUSED(x) (void)(x) + +inline void _abort(const char * str) { + std::cerr << str << std::endl; + std::abort(); +} + inline std::string get_device_type_name(const sycl::device &Device) { auto DeviceType = Device.get_info(); switch (DeviceType) { @@ -1017,7 +1025,7 @@ namespace dpct if (backend == "opencl:cpu") return 4; if (backend == "opencl:acc") return 5; printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ABORT("fatal error"); + _abort("fatal error"); } static bool compare_backend(std::string &backend1, std::string &backend2) { return convert_backend_index(backend1) < convert_backend_index(backend2); @@ -1426,7 +1434,7 @@ namespace dpct if (!size) return sycl::event{}; return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); + DPCT_UNUSED(direction); } // Get actual copy range and make sure it will not exceed range. @@ -2092,7 +2100,7 @@ namespace dpct if (!size) return sycl::event{}; return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); + DPCT_UNUSED(direction); } // Get actual copy range and make sure it will not exceed range. diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 43c7e0a933..4c0567669a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -86,6 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0; int g_ggml_sycl_use_async_mem_op_requested = 1; int g_ggml_sycl_enable_level_zero = 0; int g_ggml_sycl_enable_flash_attention = 1; +int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL; int g_ggml_sycl_usm_system = 0; static ggml_sycl_device_info ggml_sycl_init() { @@ -272,6 +273,11 @@ static void ggml_check_sycl() try { g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1); g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0); + g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL); + if (g_ggml_sycl_enable_level_zero == 0) { + g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL; + } + #ifdef SYCL_FLASH_ATTN g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); #else @@ -324,8 +330,11 @@ static void ggml_check_sycl() try { #endif #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero); + GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy); #else GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n"); + GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n", + g_ggml_sycl_dev2dev_memcpy); #endif #if GGML_SYCL_DNNL GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn); @@ -598,27 +607,42 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) { static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { + #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - // Use Level Zero direct copy for dGPU-to-dGPU transfers. - const bool l0_copy_supported = g_ggml_sycl_enable_level_zero && - ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src); - if (l0_copy_supported) { - auto ze_ctx = sycl::get_native(q_dst.get_context()); - auto ze_dev = sycl::get_native(q_dst.get_device()); - ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, - 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; - ze_command_list_handle_t cl; - ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl); - if (r == ZE_RESULT_SUCCESS) { - r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); - zeCommandListDestroy(cl); + if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) { + // Use Level Zero direct copy for dGPU-to-dGPU transfers. + const bool l0_copy_supported = + ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src); + if (g_ggml_sycl_enable_level_zero && l0_copy_supported) { + auto ze_ctx = sycl::get_native(q_dst.get_context()); + auto ze_dev = sycl::get_native(q_dst.get_device()); + ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, + 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t cl; + ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl); if (r == ZE_RESULT_SUCCESS) { - return; + GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by L0\n"); + r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); + zeCommandListDestroy(cl); + if (r == ZE_RESULT_SUCCESS) { + return; + } } } } #endif + + if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_SYCL) { + if (q_dst.get_device().ext_oneapi_can_access_peer(q_src.get_device(), + sycl::ext::oneapi::peer_access::access_supported)) { + GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by SYCL\n"); + SYCL_CHECK(CHECK_TRY_ERROR(q_dst.memcpy(ptr_dst, ptr_src, size).wait())); + return; + } + } + // Host-staged copy + GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by host forward\n"); char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();