From 9724f664e803e70eb8d046a3fac411122ad42ff7 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Thu, 18 Jun 2026 16:18:26 +0800 Subject: [PATCH] [SYCL] rename GGML_SYCL_SUPPORT_LEVEL_ZERO (#24719) * rename GGML_SYCL_SUPPORT_LEVEL_ZERO to GGML_SYCL_SUPPORT_LEVEL_ZERO_API, and GGML_SYCL_ENABLE_LEVEL_ZERO to GGML_SYCL_USE_LEVEL_ZERO_API * fix code format * fix error when rebase --- docs/backend/SYCL.md | 4 ++-- ggml/CMakeLists.txt | 2 +- ggml/src/ggml-sycl/CMakeLists.txt | 10 ++++----- ggml/src/ggml-sycl/common.cpp | 10 ++++----- ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 36 +++++++++++++++---------------- 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 68f6e60e6c..d482d88408 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -759,7 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. | | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. | -| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. | +| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).| | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -774,7 +774,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.| | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) | | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. | -| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. | +| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).| | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. | | GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 249ed3da29..0507e0c5aa 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -249,7 +249,7 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON) -option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON) +option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API "ggml: use Level Zero API in SYCL backend" ON) option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 180de92202..1c17d20df1 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -39,8 +39,8 @@ if (WIN32) set(CMAKE_CXX_COMPILER "icx") set(CMAKE_CXX_COMPILER_ID "IntelLLVM") endif() - # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled) - if(GGML_SYCL_SUPPORT_LEVEL_ZERO) + # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO_API is enabled) + if(GGML_SYCL_SUPPORT_LEVEL_ZERO_API) if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH}) set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH}) if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}") @@ -105,8 +105,8 @@ endif() target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") -message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}") -if (GGML_SYCL_SUPPORT_LEVEL_ZERO) +message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO_API ${GGML_SYCL_SUPPORT_LEVEL_ZERO_API}") +if (GGML_SYCL_SUPPORT_LEVEL_ZERO_API) # Link against Level Zero loader for direct device memory allocation. # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging # in the xe kernel driver during multi-GPU inference. @@ -114,7 +114,7 @@ if (GGML_SYCL_SUPPORT_LEVEL_ZERO) find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH) if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR) target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO_API) message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}") message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}") else() diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 38ace8bf5e..e1b6db13eb 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -12,7 +12,7 @@ #include "common.hpp" #include -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API #include #endif @@ -84,9 +84,9 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) { - return g_ggml_sycl_enable_level_zero && + return g_ggml_sycl_use_level_zero_api && q.get_device().is_gpu() && q.get_backend() == sycl::backend::ext_oneapi_level_zero; } @@ -95,7 +95,7 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) { // Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering // DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference. void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API if (ggml_sycl_use_level_zero_device_alloc(q)) { void *ptr = nullptr; auto ze_ctx = sycl::get_native(q.get_context()); @@ -127,7 +127,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API if (ggml_sycl_use_level_zero_device_alloc(q)) { auto ze_ctx = sycl::get_native(q.get_context()); zeMemFree(ze_ctx, ptr); diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index c87a4636e2..8534bd3581 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -324,7 +324,7 @@ struct ggml_tensor_extra_gpu { optimize_feature optimized_feature; }; -extern int g_ggml_sycl_enable_level_zero; +extern int g_ggml_sycl_use_level_zero_api; void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); void ggml_sycl_free_device(void *ptr, sycl::queue &q); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0aebdc4413..d8b83d0e23 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -32,7 +32,7 @@ #include #include -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API #include #endif #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC @@ -87,7 +87,7 @@ int g_ggml_sycl_enable_vmm = 1; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; int g_ggml_sycl_use_async_mem_op_requested = 1; -int g_ggml_sycl_enable_level_zero = 0; +int g_ggml_sycl_use_level_zero_api = 0; int g_ggml_sycl_enable_flash_attention = 1; int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL; int g_ggml_sycl_usm_system = 0; @@ -157,7 +157,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.ext_oneapi_level_zero = false; } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) { ze_device_handle_t ze_dev = sycl::get_native(device.default_queue().get_device()); ze_device_properties_t props = {}; @@ -172,13 +172,13 @@ static ggml_sycl_device_info ggml_sycl_init() { info.default_tensor_split[id] /= total_vram; } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API // Large buffers can be allocated before ggml_check_sycl() initializes other // g_ggml_sycl_enable_* globals, so initialize this one as early as we can. - g_ggml_sycl_enable_level_zero = - info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1); + g_ggml_sycl_use_level_zero_api = + info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_USE_LEVEL_ZERO_API", 1); #else - g_ggml_sycl_enable_level_zero = 0; + g_ggml_sycl_use_level_zero_api = 0; #endif return info; @@ -277,7 +277,7 @@ static void ggml_check_sycl() try { g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0); g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL); - if (g_ggml_sycl_enable_level_zero == 0) { + if (g_ggml_sycl_use_level_zero_api == 0) { g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL; } @@ -312,10 +312,10 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n"); #endif -#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) - GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n"); +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO_API) + GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: yes\n"); #else - GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n"); + GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: no\n"); #endif #if defined(GGML_SYCL_USE_VMM) GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n"); @@ -331,12 +331,12 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n"); #endif -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API + GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: %d\n", g_ggml_sycl_use_level_zero_api); GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy); #else - GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n"); - GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n", + GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: Disable Level Zero API usage by compile flag\n"); + GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO_API\n", g_ggml_sycl_dev2dev_memcpy); #endif #if GGML_SYCL_DNNL @@ -602,7 +602,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API static bool ggml_sycl_is_l0_discrete_gpu(int device) { return ggml_sycl_info().devices[device].l0_discrete_gpu; } @@ -611,12 +611,12 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) { static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) { // Use Level Zero direct copy for dGPU-to-dGPU transfers. const bool l0_copy_supported = ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src); - if (g_ggml_sycl_enable_level_zero && l0_copy_supported) { + if (g_ggml_sycl_use_level_zero_api && l0_copy_supported) { auto ze_ctx = sycl::get_native(q_dst.get_context()); auto ze_dev = sycl::get_native(q_dst.get_device()); ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,