mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
ggml-cpu: fix SVE leftover path in ggml_vec_dot_f32 (#24699)
* ggml-cpu: fix SVE leftover path in ggml_vec_dot_f32. 2D convolutions with kernel size 9 produced different results on SVE-enabled ARM devices. Debugging showed that ggml_vec_dot_f32 was using data from inactive lanes. Use svmla_f32_m(pg, sum1, ax1, ay1) so that inactive lanes retain sum1. * cont : clean-up --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
1a87dcdc45
commit
c16c35b814
@ -75,12 +75,12 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
}
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
// maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmla on available elements only
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b32(np2, n);
|
||||
ax1 = svld1_f32(pg, x + np2);
|
||||
ay1 = svld1_f32(pg, y + np2);
|
||||
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
|
||||
sum1 = svmla_f32_m(pg, sum1, ax1, ay1);
|
||||
}
|
||||
// reduce sum1,sum2 to sum1
|
||||
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
||||
|
||||
@ -102,33 +102,41 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
|
||||
return fabsf(result - dot_ref) / test_size;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
bool verbose = false;
|
||||
static int test_vec_dot_f32(bool verbose) {
|
||||
const auto * f32 = ggml_get_type_traits_cpu(GGML_TYPE_F32);
|
||||
int num_failed = 0;
|
||||
for (int n : {1, 2, 3, 5, 7, 8, 15, 16, 17, 31, 33, 63, 67, 127, 129, 193, 255, 1023}) {
|
||||
std::vector<float> a(n);
|
||||
std::vector<float> b(n);
|
||||
generate_data(0.0, n, a.data());
|
||||
generate_data(1.0, n, b.data());
|
||||
|
||||
float result = 0.0f;
|
||||
f32->vec_dot(n, &result, 0, a.data(), 0, b.data(), 0, 1);
|
||||
const float ref = dot_product(a.data(), b.data(), n);
|
||||
const float error = fabsf(result - ref) / n;
|
||||
|
||||
const bool failed = !(error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf(" f32 vec_dot n=%4d: %s (ref=%f got=%f err=%f)\n",
|
||||
n, RESULT_STR[failed], ref, result, error);
|
||||
}
|
||||
}
|
||||
return num_failed;
|
||||
}
|
||||
|
||||
static int test_vec_dot_q(bool verbose) {
|
||||
int num_failed = 0;
|
||||
|
||||
const size_t test_size = 32 * 128;
|
||||
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-v") {
|
||||
verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> test_data(test_size);
|
||||
std::vector<float> test_data2(test_size);
|
||||
|
||||
generate_data(0.0, test_data.size(), test_data.data());
|
||||
generate_data(1.0, test_data2.size(), test_data2.data());
|
||||
|
||||
ggml_cpu_init();
|
||||
|
||||
int num_failed = 0;
|
||||
bool failed = false;
|
||||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
@ -156,7 +164,7 @@ int main(int argc, char * argv[]) {
|
||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
|
||||
type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
failed = !(total_error < max_quantization_error);
|
||||
bool failed = !(total_error < max_quantization_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
||||
@ -188,6 +196,31 @@ int main(int argc, char * argv[]) {
|
||||
}
|
||||
}
|
||||
|
||||
return num_failed;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
bool verbose = false;
|
||||
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-v") {
|
||||
verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cpu_init();
|
||||
|
||||
int num_failed = 0;
|
||||
|
||||
num_failed += test_vec_dot_f32(verbose);
|
||||
num_failed += test_vec_dot_q(verbose);
|
||||
|
||||
if (num_failed || verbose) {
|
||||
printf("%d tests failed\n", num_failed);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user