From 5a6a0dd7e1f7342bac4618a223fc21f9cece3b48 Mon Sep 17 00:00:00 2001 From: "Jiang, Fish" Date: Fri, 26 Jun 2026 11:26:22 +0000 Subject: [PATCH] vulkan: add INTEL_XE1 arch enum and enable coopmat1 on Intel Xe-LPG Plus (#24404) * vulkan: add INTEL_PRE_XE2 arch enum and enable coopmat1 on Intel Xe-LPG Plus (1/3, Xe1-ARLH) Co-authored-by: Xia, Jie Co-authored-by: Liu, Russell * Address comments of bf16 and trailing whitespace * Rename INTEL_PRE_XE2 to INTEL_XE1 and remove driver workaround * Add Windows driver check --------- Co-authored-by: Xia, Jie Co-authored-by: Liu, Russell --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 5fbebc6d75..c090d44420 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -308,6 +308,7 @@ enum vk_device_architecture { AMD_RDNA1, AMD_RDNA2, AMD_RDNA3, + INTEL_XE1, INTEL_XE2, NVIDIA_PRE_TURING, NVIDIA_TURING, @@ -365,21 +366,26 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& const std::vector ext_props = device.enumerateDeviceExtensionProperties(); bool subgroup_size_control = false; + bool integer_dot_product = false; for (const auto& properties : ext_props) { if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { subgroup_size_control = true; + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { + integer_dot_product = true; } } - if (!subgroup_size_control) { + if (!subgroup_size_control || !integer_dot_product) { return vk_device_architecture::OTHER; } vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; props2.pNext = &subgroup_size_control_props; + subgroup_size_control_props.pNext = &integer_dot_props; device.getProperties2(&props2); if (subgroup_size_control_props.minSubgroupSize == 16) { @@ -388,6 +394,9 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html return vk_device_architecture::INTEL_XE2; + } else if (subgroup_size_control_props.minSubgroupSize == 8 && + integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) { + return vk_device_architecture::INTEL_XE1; } } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) { const std::vector ext_props = device.enumerateDeviceExtensionProperties(); @@ -3837,7 +3846,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 }; - } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) { + } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) { // Xe2/Xe3 with coopmat enabled - warptile performance tuning l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; @@ -6361,9 +6370,8 @@ static vk_device ggml_vk_get_device(size_t idx) { break; case VK_VENDOR_ID_INTEL: { // Current Windows driver does not expose BF16 support. - // We only want to use l_warptile if coopmat is available and is Xe2+ - const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2; - const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat; + // We only want to use l_warptile if coopmat is available + const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support; device->mul_mat_l[i] = use_l_warptile; device->mul_mat_id_l[i] = use_l_warptile; device->mul_mat_m[i] = true; @@ -17890,9 +17898,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) { static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: - // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, - // while some older hardware (ex. Arc A770) has performance regressions - return arch == vk_device_architecture::INTEL_XE2; + // Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions. + return (arch == vk_device_architecture::INTEL_XE2) || + (arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows); case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs @@ -17940,6 +17948,8 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) case 0xE20B: // B580 case 0xE211: // Pro B60 return 20; + case 0xB080: // PTL Xe3 LPG 2x6 (12 subslices) + return 12; default: return 0; }