From 7cacf28eecec3f702200de8671ebcc6a3844c6c4 Mon Sep 17 00:00:00 2001
From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:09:33 +0200
Subject: [PATCH] Fix minor GGML discrepancies (#2016)

* fix: wrong stride in batched quantized add1 (nb0 -> nb3)

ggml_compute_forward_add1_q_f32 used i3*nb0 (element stride) instead of
i3*nb3 (batch stride) for the destination row pointer. This causes all
add1 operations with quantized types and batch > 1 to write to wrong
memory locations. The src0 pointer on the line above correctly uses nb03.

* fix: wrong dimension limits in dup_f16 non-contiguous path

The destination index wrapping in ggml_compute_forward_dup_f16 used
source dimensions (ne00/ne01/ne02/ne03) instead of destination dimensions
(ne0/ne1/ne2/ne3). While source and destination shapes are currently
identical for dup, wrapping destination indices on source dimensions
is incorrect in principle.

* fix: wrong dimension limits in dup_bf16 non-contiguous path

Same fix as the dup_f16 path: destination index wrapping used source
dimensions (ne00/ne01/ne02/ne03) instead of destination dimensions
(ne0/ne1/ne2/ne3). Copy-paste error from the contiguous path.

* fix: ACC work size uses src[1] instead of src[0]

The dequantization work buffer for quantized ACC was sized using
src[1]->ne[0] instead of src[0]->ne[0]. Since src[0] is the tensor
being dequantized, its dimensions should determine the buffer size.

* fix: missing work size for SOFT_CAP_MAX and ROPE_BACK

Both ops dereference params->wdata in their forward functions but had
no work size allocation (cur = 0), causing a NULL pointer dereference
when any thread attempted to use wdata.

* fix: wrong dim in sum_rows_f32 dimension decomposition

Line 14404 used ne01*ne0 (= ne01*1) instead of ne01*ne02 for the
i3 term when recovering i1 from the flat row index. For rows with
i3 > 0 when ne02 > 1 (batched inputs with ne03 > 1), i1 comes out too
large, which causes wrong memory access and corrupted results.
---
 ggml/src/ggml.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 526e1b4d..f3b59e9e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -11415,13 +11415,13 @@ static void ggml_compute_forward_dup_f16(
 
                         memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
 
-                        if (++i10 == ne00) {
+                        if (++i10 == ne0) {
                             i10 = 0;
-                            if (++i11 == ne01) {
+                            if (++i11 == ne1) {
                                 i11 = 0;
-                                if (++i12 == ne02) {
+                                if (++i12 == ne2) {
                                     i12 = 0;
-                                    if (++i13 == ne03) {
+                                    if (++i13 == ne3) {
                                         i13 = 0;
                                     }
                                 }
@@ -11719,13 +11719,13 @@ static void ggml_compute_forward_dup_bf16(
 
                         memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
 
-                        if (++i10 == ne00) {
+                        if (++i10 == ne0) {
                             i10 = 0;
-                            if (++i11 == ne01) {
+                            if (++i11 == ne1) {
                                 i11 = 0;
-                                if (++i12 == ne02) {
+                                if (++i12 == ne2) {
                                     i12 = 0;
-                                    if (++i13 == ne03) {
+                                    if (++i13 == ne3) {
                                         i13 = 0;
                                     }
                                 }
@@ -13341,7 +13341,7 @@ static void ggml_compute_forward_add1_q_f32(
         const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
 
         void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
-        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
+        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb3 ));
 
         assert(ne0 % 32 == 0);
 
@@ -14401,7 +14401,7 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int ir = first_row; ir < last_row; ++ir) {
         int i3 = ir / (ne01*ne02);
         int i2 = (ir - i3*ne01*ne02)/ne01;
-        int i1 = ir - i3*ne01*ne0 - i2*ne01;
+        int i1 = ir - i3*ne01*ne02 - i2*ne01;
         const float * src_row = (const float *)((const char *)src0->data + i1*nb01 + i2*nb02 + i3*nb03);
               float * dst_row = (      float *)((      char *)dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
         float row_sum = 0;
@@ -26632,7 +26632,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             case GGML_OP_ACC:
                 {
                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_MUL_MAT:
@@ -26692,7 +26692,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_SOFT_CAP_MAX:
             case GGML_OP_ROPE:
+            case GGML_OP_ROPE_BACK:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;