Fix minor GGML discrepancies (#2016)

* fix: wrong stride in batched quantized add1 (nb0 -> nb3)
  ggml_compute_forward_add1_q_f32 used i3*nb0 (element stride) instead of i3*nb3 (batch stride) for the destination row pointer. This causes all add1 operations with quantized types and batch > 1 to write to wrong memory locations. The src0 pointer on the line above correctly uses nb03.
* fix: wrong dimension limits in dup_f16 non-contiguous path
  The destination index wrapping in ggml_compute_forward_dup_f16 used source dimensions (ne00/ne01/ne02/ne03) instead of destination dimensions (ne0/ne1/ne2/ne3). While source and destination shapes are currently identical for dup, using the wrong variables is incorrect by design.
* fix: wrong dimension limits in dup_bf16 non-contiguous path
  Same fix as the dup_f16 path: destination index wrapping used source dimensions (ne00/ne01/ne02/ne03) instead of destination dimensions (ne0/ne1/ne2/ne3). Copy-paste error from the contiguous path.
* fix: ACC work size uses src[1] instead of src[0]
  The dequantization work buffer for quantized ACC was sized using src[1]->ne[0] instead of src[0]->ne[0]. Since src[0] is the tensor being dequantized, its dimensions should determine the buffer size.
* fix: missing work size for SOFT_CAP_MAX and ROPE_BACK
  Both ops dereference params->wdata in their forward functions but had no work size allocation (cur = 0), causing a NULL pointer dereference when any thread attempted to use wdata.
* fix: wrong dim in sum_rows_f32 dimension decomposition
  Line 14404 used ne01*ne0 (= ne01*1) instead of ne01*ne02 for the i3 term when decomposing the flat row index into (i1, i2, i3). Because that term is multiplied by i3, the bug shows up when ne02 > 1 and i3 > 0 (i.e. ne03 > 1): i1 is then computed incorrectly, causing wrong memory access and corrupted results.
2026-06-28 04:30:15 -05:00 · 2026-06-24 09:09:33 +02:00 · 2026-06-24 09:09:33 +02:00 · 7cacf28eec
commit 7cacf28eec
parent 8686ea708b
1 changed files with 13 additions and 11 deletions
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -11415,13 +11415,13 @@ static void ggml_compute_forward_dup_f16(

                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));

-                        if (++i10 == ne00) {
+                        if (++i10 == ne0) {
                            i10 = 0;
-                            if (++i11 == ne01) {
+                            if (++i11 == ne1) {
                                i11 = 0;
-                                if (++i12 == ne02) {
+                                if (++i12 == ne2) {
                                    i12 = 0;
-                                    if (++i13 == ne03) {
+                                    if (++i13 == ne3) {
                                        i13 = 0;
                                    }
                                }
@@ -11719,13 +11719,13 @@ static void ggml_compute_forward_dup_bf16(

                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));

-                        if (++i10 == ne00) {
+                        if (++i10 == ne0) {
                            i10 = 0;
-                            if (++i11 == ne01) {
+                            if (++i11 == ne1) {
                                i11 = 0;
-                                if (++i12 == ne02) {
+                                if (++i12 == ne2) {
                                    i12 = 0;
-                                    if (++i13 == ne03) {
+                                    if (++i13 == ne3) {
                                        i13 = 0;
                                    }
                                }
@@ -13341,7 +13341,7 @@ static void ggml_compute_forward_add1_q_f32(
        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

        void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
-        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
+        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb3 ));

        assert(ne0 % 32 == 0);

@@ -14401,7 +14401,7 @@ static void ggml_compute_forward_sum_rows_f32(
    for (int ir = first_row; ir < last_row; ++ir) {
        int i3 = ir / (ne01*ne02);
        int i2 = (ir - i3*ne01*ne02)/ne01;
-        int i1 = ir - i3*ne01*ne0 - i2*ne01;
+        int i1 = ir - i3*ne01*ne02 - i2*ne01;
        const float * src_row = (const float *)((const char *)src0->data + i1*nb01 + i2*nb02 + i3*nb03);
              float * dst_row = (      float *)((      char *)dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
        float row_sum = 0;
@@ -26632,7 +26632,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
            case GGML_OP_ACC:
                {
                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }
                } break;
            case GGML_OP_MUL_MAT:
@@ -26692,7 +26692,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                    }
                } break;
            case GGML_OP_SOFT_MAX:
+            case GGML_OP_SOFT_CAP_MAX:
            case GGML_OP_ROPE:
+            case GGML_OP_ROPE_BACK:
                {
                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                } break;