mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
vulkan: add INTEL_XE1 arch enum and enable coopmat1 on Intel Xe-LPG Plus (#24404)
* vulkan: add INTEL_PRE_XE2 arch enum and enable coopmat1 on Intel Xe-LPG Plus (1/3, Xe1-ARLH) Co-authored-by: Xia, Jie <jie.xia@intel.com> Co-authored-by: Liu, Russell <russell.liu@intel.com> * Address comments of bf16 and trailing whitespace * Rename INTEL_PRE_XE2 to INTEL_XE1 and remove driver workaround * Add Windows driver check --------- Co-authored-by: Xia, Jie <jie.xia@intel.com> Co-authored-by: Liu, Russell <russell.liu@intel.com>
This commit is contained in:
parent
ded1561b42
commit
5a6a0dd7e1
@ -308,6 +308,7 @@ enum vk_device_architecture {
|
|||||||
AMD_RDNA1,
|
AMD_RDNA1,
|
||||||
AMD_RDNA2,
|
AMD_RDNA2,
|
||||||
AMD_RDNA3,
|
AMD_RDNA3,
|
||||||
|
INTEL_XE1,
|
||||||
INTEL_XE2,
|
INTEL_XE2,
|
||||||
NVIDIA_PRE_TURING,
|
NVIDIA_PRE_TURING,
|
||||||
NVIDIA_TURING,
|
NVIDIA_TURING,
|
||||||
@ -365,21 +366,26 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
|||||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||||
|
|
||||||
bool subgroup_size_control = false;
|
bool subgroup_size_control = false;
|
||||||
|
bool integer_dot_product = false;
|
||||||
|
|
||||||
for (const auto& properties : ext_props) {
|
for (const auto& properties : ext_props) {
|
||||||
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
||||||
subgroup_size_control = true;
|
subgroup_size_control = true;
|
||||||
|
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
|
||||||
|
integer_dot_product = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!subgroup_size_control) {
|
if (!subgroup_size_control || !integer_dot_product) {
|
||||||
return vk_device_architecture::OTHER;
|
return vk_device_architecture::OTHER;
|
||||||
}
|
}
|
||||||
|
|
||||||
vk::PhysicalDeviceProperties2 props2;
|
vk::PhysicalDeviceProperties2 props2;
|
||||||
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
||||||
|
vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
|
||||||
|
|
||||||
props2.pNext = &subgroup_size_control_props;
|
props2.pNext = &subgroup_size_control_props;
|
||||||
|
subgroup_size_control_props.pNext = &integer_dot_props;
|
||||||
device.getProperties2(&props2);
|
device.getProperties2(&props2);
|
||||||
|
|
||||||
if (subgroup_size_control_props.minSubgroupSize == 16) {
|
if (subgroup_size_control_props.minSubgroupSize == 16) {
|
||||||
@ -388,6 +394,9 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
|||||||
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
|
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
|
||||||
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
||||||
return vk_device_architecture::INTEL_XE2;
|
return vk_device_architecture::INTEL_XE2;
|
||||||
|
} else if (subgroup_size_control_props.minSubgroupSize == 8 &&
|
||||||
|
integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) {
|
||||||
|
return vk_device_architecture::INTEL_XE1;
|
||||||
}
|
}
|
||||||
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
|
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
|
||||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||||
@ -3837,7 +3846,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
|||||||
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
|
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
|
||||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
|
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) {
|
||||||
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
||||||
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
@ -6361,9 +6370,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||||||
break;
|
break;
|
||||||
case VK_VENDOR_ID_INTEL: {
|
case VK_VENDOR_ID_INTEL: {
|
||||||
// Current Windows driver does not expose BF16 support.
|
// Current Windows driver does not expose BF16 support.
|
||||||
// We only want to use l_warptile if coopmat is available and is Xe2+
|
// We only want to use l_warptile if coopmat is available
|
||||||
const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2;
|
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support;
|
||||||
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat;
|
|
||||||
device->mul_mat_l[i] = use_l_warptile;
|
device->mul_mat_l[i] = use_l_warptile;
|
||||||
device->mul_mat_id_l[i] = use_l_warptile;
|
device->mul_mat_id_l[i] = use_l_warptile;
|
||||||
device->mul_mat_m[i] = true;
|
device->mul_mat_m[i] = true;
|
||||||
@ -17890,9 +17898,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
|
|||||||
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
||||||
switch (props.vendorID) {
|
switch (props.vendorID) {
|
||||||
case VK_VENDOR_ID_INTEL:
|
case VK_VENDOR_ID_INTEL:
|
||||||
// Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
|
// Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions.
|
||||||
// while some older hardware (ex. Arc A770) has performance regressions
|
return (arch == vk_device_architecture::INTEL_XE2) ||
|
||||||
return arch == vk_device_architecture::INTEL_XE2;
|
(arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows);
|
||||||
case VK_VENDOR_ID_AMD:
|
case VK_VENDOR_ID_AMD:
|
||||||
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
|
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
|
||||||
// Workaround for AMD proprietary driver reporting support on all GPUs
|
// Workaround for AMD proprietary driver reporting support on all GPUs
|
||||||
@ -17940,6 +17948,8 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
|
|||||||
case 0xE20B: // B580
|
case 0xE20B: // B580
|
||||||
case 0xE211: // Pro B60
|
case 0xE211: // Pro B60
|
||||||
return 20;
|
return 20;
|
||||||
|
case 0xB080: // PTL Xe3 LPG 2x6 (12 subslices)
|
||||||
|
return 12;
|
||||||
default:
|
default:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user