From 7cacf28eecec3f702200de8671ebcc6a3844c6c4 Mon Sep 17 00:00:00 2001 From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:09:33 +0200 Subject: [PATCH] Fix minor GGML discrepancies (#2016) * fix: wrong stride in batched quantized add1 (nb0 -> nb3) ggml_compute_forward_add1_q_f32 used i3*nb0 (element stride) instead of i3*nb3 (batch stride) for the destination row pointer. This causes all add1 operations with quantized types and batch > 1 to write to wrong memory locations. The src0 pointer on the line above correctly uses nb03. * fix: wrong dimension limits in dup_f16 non-contiguous path The destination index wrapping in ggml_compute_forward_dup_f16 used source dimensions (ne00/ne01/ne02/ne03) instead of destination dimensions (ne0/ne1/ne2/ne3). While source and destination shapes are currently identical for dup, using the wrong variables is incorrect by design. * fix: wrong dimension limits in dup_bf16 non-contiguous path Same fix as the dup_f16 path: destination index wrapping used source dimensions (ne00/ne01/ne02/ne03) instead of destination dimensions (ne0/ne1/ne2/ne3). Copy-paste error from the contiguous path. * fix: ACC work size uses src[1] instead of src[0] The dequantization work buffer for quantized ACC was sized using src[1]->ne[0] instead of src[0]->ne[0]. Since src[0] is the tensor being dequantized, its dimensions should determine the buffer size. * fix: missing work size for SOFT_CAP_MAX and ROPE_BACK Both ops dereference params->wdata in their forward functions but had no work size allocation (cur = 0), causing a NULL pointer dereference when any thread attempted to use wdata. * fix: wrong dim in sum_rows_f32 dimension decomposition Line 14404 used ne01*ne0 (= ne01*1) instead of ne01*ne02 for the i3 term in the flat row index formula. When ne02 > 1 (batched 2D inputs), this causes wrong memory access and corrupted results. 
--- ggml/src/ggml.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 526e1b4d..f3b59e9e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -11415,13 +11415,13 @@ static void ggml_compute_forward_dup_f16( memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); - if (++i10 == ne00) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == ne01) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == ne02) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == ne03) { + if (++i13 == ne3) { i13 = 0; } } @@ -11719,13 +11719,13 @@ static void ggml_compute_forward_dup_bf16( memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t)); - if (++i10 == ne00) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == ne01) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == ne02) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == ne03) { + if (++i13 == ne3) { i13 = 0; } } @@ -13341,7 +13341,7 @@ static void ggml_compute_forward_add1_q_f32( const int i1 = (ir - i3*ne2*ne1 - i2*ne1); void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); - void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); + void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3 )); assert(ne0 % 32 == 0); @@ -14401,7 +14401,7 @@ static void ggml_compute_forward_sum_rows_f32( for (int ir = first_row; ir < last_row; ++ir) { int i3 = ir / (ne01*ne02); int i2 = (ir - i3*ne01*ne02)/ne01; - int i1 = ir - i3*ne01*ne0 - i2*ne01; + int i1 = ir - i3*ne01*ne02 - i2*ne01; const float * src_row = (const float *)((const char *)src0->data + i1*nb01 + i2*nb02 + i3*nb03); float * dst_row = ( float *)(( char *)dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; @@ -26632,7 +26632,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa case GGML_OP_ACC: { if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + cur = 
ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } } break; case GGML_OP_MUL_MAT: @@ -26692,7 +26692,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } } break; case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_CAP_MAX: case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } break;