vulkan: add INTEL_XE1 arch enum and enable coopmat1 on Intel Xe-LPG Plus (#24404)

* vulkan: add INTEL_PRE_XE2 arch enum and enable coopmat1 on Intel Xe-LPG Plus (1/3, Xe1-ARLH)

Co-authored-by: Xia, Jie <jie.xia@intel.com>
Co-authored-by: Liu, Russell <russell.liu@intel.com>

* Address review comments on bf16 handling and trailing whitespace

* Rename INTEL_PRE_XE2 to INTEL_XE1 and remove driver workaround

* Add Windows driver check

---------

Co-authored-by: Xia, Jie <jie.xia@intel.com>
Co-authored-by: Liu, Russell <russell.liu@intel.com>
This commit is contained in:
Jiang, Fish 2026-06-26 11:26:22 +00:00 committed by GitHub
parent ded1561b42
commit 5a6a0dd7e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -308,6 +308,7 @@ enum vk_device_architecture {
AMD_RDNA1, AMD_RDNA1,
AMD_RDNA2, AMD_RDNA2,
AMD_RDNA3, AMD_RDNA3,
INTEL_XE1,
INTEL_XE2, INTEL_XE2,
NVIDIA_PRE_TURING, NVIDIA_PRE_TURING,
NVIDIA_TURING, NVIDIA_TURING,
@ -365,21 +366,26 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties(); const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
bool subgroup_size_control = false; bool subgroup_size_control = false;
bool integer_dot_product = false;
for (const auto& properties : ext_props) { for (const auto& properties : ext_props) {
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
subgroup_size_control = true; subgroup_size_control = true;
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
integer_dot_product = true;
} }
} }
if (!subgroup_size_control) { if (!subgroup_size_control || !integer_dot_product) {
return vk_device_architecture::OTHER; return vk_device_architecture::OTHER;
} }
vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceProperties2 props2;
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
props2.pNext = &subgroup_size_control_props; props2.pNext = &subgroup_size_control_props;
subgroup_size_control_props.pNext = &integer_dot_props;
device.getProperties2(&props2); device.getProperties2(&props2);
if (subgroup_size_control_props.minSubgroupSize == 16) { if (subgroup_size_control_props.minSubgroupSize == 16) {
@ -388,6 +394,9 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
return vk_device_architecture::INTEL_XE2; return vk_device_architecture::INTEL_XE2;
} else if (subgroup_size_control_props.minSubgroupSize == 8 &&
integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) {
return vk_device_architecture::INTEL_XE1;
} }
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) { } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties(); const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
@ -3837,7 +3846,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 }; l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) { } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) {
// Xe2/Xe3 with coopmat enabled - warptile performance tuning // Xe2/Xe3 with coopmat enabled - warptile performance tuning
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
@ -6361,9 +6370,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
break; break;
case VK_VENDOR_ID_INTEL: { case VK_VENDOR_ID_INTEL: {
// Current Windows driver does not expose BF16 support. // Current Windows driver does not expose BF16 support.
// We only want to use l_warptile if coopmat is available and is Xe2+ // We only want to use l_warptile if coopmat is available
const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2; const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support;
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat;
device->mul_mat_l[i] = use_l_warptile; device->mul_mat_l[i] = use_l_warptile;
device->mul_mat_id_l[i] = use_l_warptile; device->mul_mat_id_l[i] = use_l_warptile;
device->mul_mat_m[i] = true; device->mul_mat_m[i] = true;
@ -17890,9 +17898,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
switch (props.vendorID) { switch (props.vendorID) {
case VK_VENDOR_ID_INTEL: case VK_VENDOR_ID_INTEL:
// Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, // Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions.
// while some older hardware (ex. Arc A770) has performance regressions return (arch == vk_device_architecture::INTEL_XE2) ||
return arch == vk_device_architecture::INTEL_XE2; (arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows);
case VK_VENDOR_ID_AMD: case VK_VENDOR_ID_AMD:
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
// Workaround for AMD proprietary driver reporting support on all GPUs // Workaround for AMD proprietary driver reporting support on all GPUs
@ -17940,6 +17948,8 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
case 0xE20B: // B580 case 0xE20B: // B580
case 0xE211: // Pro B60 case 0xE211: // Pro B60
return 20; return 20;
case 0xB080: // PTL Xe3 LPG 2x6 (12 subslices)
return 12;
default: default:
return 0; return 0;
} }