diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
index 6fcf5a4339..5ad121ae57 100644
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -8,10 +8,10 @@ extern "C" {
 
 #define RPC_PROTO_MAJOR_VERSION    4
 #define RPC_PROTO_MINOR_VERSION    0
-#define RPC_PROTO_PATCH_VERSION    0
+#define RPC_PROTO_PATCH_VERSION    1
 
 #ifdef  __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif
 
 #define GGML_RPC_MAX_SERVERS       16
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f672526550..374934aacf 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -535,6 +535,7 @@ extern "C" {
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
         GGML_OP_IM2COL_3D,
+        GGML_OP_COL2IM_1D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
@@ -2007,6 +2008,16 @@ extern "C" {
         int                   d1, // dilation dimension 1
         bool                  is_2D);
 
+    // col2im_1d: overlap-add GEMM columns into a 1D signal: a[oc*K + k, t_in] is added to result[t_in*s0 + k - p0, oc]
+    // a: [K*OC, T_in]  contiguous F32/F16/BF16 matrix, K = a->ne[0]/oc (a->ne[0] must be a multiple of oc)
+    // result: [T_out, OC]  same type as a, where T_out = (T_in - 1)*s0 + K - 2*p0 (must be > 0)
+    GGML_API struct ggml_tensor * ggml_col2im_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,   // columns [K*OC, T_in], k fastest inside each oc block
+        int                   s0,  // stride between consecutive columns in the output signal, > 0
+        int                   oc,  // output channels, > 0
+        int                   p0); // samples cropped from both ends of the uncropped signal, >= 0
+
     GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index cd5c61a818..af7827aec3 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_3d(params, tensor);
             } break;
+        case GGML_OP_COL2IM_1D:
+            {
+                ggml_compute_forward_col2im_1d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D:
             {
                 ggml_compute_forward_conv_2d(params, tensor);
@@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_CONV_2D:
         case GGML_OP_CONV_3D:
         case GGML_OP_CONV_2D_DW:
+        case GGML_OP_COL2IM_1D:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index becac9d6ef..86842e5547 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6730,6 +6730,78 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
     return (coord  + size) % size; // adding size avoids negative number weirdness
 }
 
+// ggml_compute_forward_col2im_1d
+//
+// Scatter-add columns [K*OC, T_in] -> signal [T_out, OC]
+// where T_out = (T_in - 1)*s + K - 2*p.  Gather approach: each output reads ceil(K/s) inputs.
+// Parallelized over the time axis so the split stays balanced whatever OC is.
+// Supports F32, F16, BF16 input/output (same type), F32 accumulator.
+
+template <typename elem_t>
+static void ggml_compute_forward_col2im_1d_impl(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    // Overlap-add in gather form: every output sample sums the column entries
+    // that land on it, so each dst element is written exactly once.
+    const ggml_tensor * cols = dst->src[0];  // [K*OC, T_in]
+
+    GGML_ASSERT(ggml_is_contiguous(cols));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int32_t * op_params = (const int32_t *) dst->op_params;
+    const int32_t stride = op_params[0];
+    const int32_t n_chan = op_params[1];
+    const int32_t crop   = op_params[2];
+
+    const int64_t col_len = cols->ne[0];       // K*OC entries per column
+    const int64_t n_cols  = cols->ne[1];       // T_in
+    const int64_t ksize   = col_len / n_chan;  // K
+    const int64_t n_out   = dst->ne[0];        // T_out
+
+    const elem_t * in  = (const elem_t *) cols->data;
+    elem_t       * out = (elem_t *) dst->data;
+
+    // Each thread owns one contiguous band of output time steps, across all channels
+    const int64_t band  = (n_out + params->nth - 1) / params->nth;
+    const int64_t first = band * params->ith;
+    const int64_t last  = first + band < n_out ? first + band : n_out;
+
+    for (int64_t c = 0; c < n_chan; c++) {
+        const int64_t row0  = c * ksize;        // first tap of channel c inside a column
+        elem_t      * out_c = out + c * n_out;  // row c of dst [T_out, OC]
+        for (int64_t t = first; t < last; t++) {
+            const int64_t pos = t + crop;  // position in the uncropped signal
+            // Columns whose taps [col*stride, col*stride + K) can reach pos, clamped to [0, T_in)
+            int64_t lo = (pos - ksize + 1 + stride - 1) / stride;
+            int64_t hi = pos / stride;
+            lo = lo < 0 ? 0 : lo;
+            hi = hi >= n_cols ? n_cols - 1 : hi;
+
+            float acc = 0.0f;  // F32 accumulator whatever elem_t is
+            for (int64_t col = lo; col <= hi; col++) {
+                const int64_t tap = pos - col * stride;
+                if (tap < 0 || tap >= ksize) {
+                    continue;
+                }
+                acc += type_conversion_table<elem_t>::to_f32(in[row0 + tap + col * col_len]);
+            }
+            out_c[t] = type_conversion_table<elem_t>::from_f32(acc);
+        }
+    }
+}
+
+void ggml_compute_forward_col2im_1d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    // The element type of the column matrix selects the kernel instantiation
+    const ggml_tensor * cols = dst->src[0];
+    if      (cols->type == GGML_TYPE_F32)  { ggml_compute_forward_col2im_1d_impl<float>      (params, dst); }
+    else if (cols->type == GGML_TYPE_F16)  { ggml_compute_forward_col2im_1d_impl<ggml_fp16_t>(params, dst); }
+    else if (cols->type == GGML_TYPE_BF16) { ggml_compute_forward_col2im_1d_impl<ggml_bf16_t>(params, dst); }
+    else { GGML_ABORT("col2im_1d: unsupported type %d", cols->type); }
+}
+
 // ggml_compute_forward_conv_2d
 
 
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 7398e56189..a8e18c716d 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 8815c67d8b..18a5ebd2ab 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1031,6 +1031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "IM2COL",
     "IM2COL_BACK",
     "IM2COL_3D",
+    "COL2IM_1D",
     "CONV_2D",
     "CONV_3D",
     "CONV_2D_DW",
@@ -1080,7 +1081,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1141,6 +1142,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "im2col(x)",
     "im2col_back(x)",
     "im2col_3d(x)",
+    "col2im_1d(x)",
     "conv_2d(x)",
     "conv_3d(x)",
     "conv_2d_dw(x)",
@@ -1190,7 +1192,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4541,6 +4543,41 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
     return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
 }
 
+// ggml_col2im_1d
+
+struct ggml_tensor * ggml_col2im_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   s0,
+        int                   oc,
+        int                   p0) {
+    // a: contiguous [K*OC, T_in] columns; returns the [T_out, oc] overlap-add node of the same type
+    GGML_ASSERT(ggml_is_matrix(a));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16);
+    GGML_ASSERT(s0 > 0);
+    GGML_ASSERT(oc > 0);
+    GGML_ASSERT(p0 >= 0);
+
+    const int64_t K_OC = a->ne[0];
+    const int64_t T_in = a->ne[1];
+    const int64_t K = K_OC / oc;
+    // widen p0 before doubling: 2 * p0 in int overflows (UB) for p0 > INT_MAX/2 and could slip past the T_out check
+    const int64_t T_out = (T_in - 1) * s0 + K - 2 * (int64_t) p0;
+
+    GGML_ASSERT(K_OC == K * oc);  // a->ne[0] must be a whole number of oc blocks
+    GGML_ASSERT(K > 0 && T_out > 0);
+
+    const int64_t ne[4] = { T_out, oc, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 2, ne);
+    // op_params layout: { stride, output channels, crop }
+    int32_t params[] = { s0, oc, p0 };
+    ggml_set_op_params(result, params, sizeof(params));
+    result->op     = GGML_OP_COL2IM_1D;
+    result->src[0] = a;
+    return result;
+}
+
 // ggml_conv_transpose_1d
 
 static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 33ae3b303c..e284a58d1c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -265,6 +265,7 @@ if (NOT GGML_BACKEND_DL)
     llama_build_and_test(test-quantize-fns.cpp)
     llama_build_and_test(test-quantize-perf.cpp)
     llama_build_and_test(test-rope.cpp)
+    llama_build_and_test(test-col2im-1d.cpp)
 endif()
 
 # libmtmd
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index f561d09b5b..c30b4e9815 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5098,6 +5098,39 @@ struct test_conv_transpose_1d : public test_case {
     }
 };
 
+// GGML_OP_COL2IM_1D
+struct test_col2im_1d : public test_case {
+    const ggml_type type;  // element type of the columns and of the output
+    const int64_t K;       // kernel size
+    const int64_t OC;      // output channels
+    const int64_t T_in;    // input length (number of columns)
+    const int s0;          // stride
+    const int p0;          // padding cropped from both sides
+
+    test_col2im_1d(ggml_type type = GGML_TYPE_F32,
+            int64_t K = 4, int64_t OC = 3, int64_t T_in = 7, int s0 = 2, int p0 = 0)
+        : type(type), K(K), OC(OC), T_in(T_in), s0(s0), p0(p0) {}
+
+    std::string vars() override {
+        return VARS_TO_STR6(type, K, OC, T_in, s0, p0);
+    }
+
+    double max_nmse_err() override {
+        if (type == GGML_TYPE_F32) {
+            return 1e-7;
+        }
+        return 5e-4;  // looser bound for every non-F32 type
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * cols = ggml_new_tensor_2d(ctx, type, K*OC, T_in);
+        ggml_tensor * out  = ggml_col2im_1d(ctx, cols, s0, (int) OC, p0);
+        ggml_set_name(cols, "cols");
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
 // GGML_OP_CONV_TRANSPOSE_2D
 struct test_conv_transpose_2d : public test_case {
     // Dimensions
@@ -8013,6 +8046,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
+    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16}) {
+        // ConvTranspose1d expressed as mul_mat + col2im (DAC decoder upsampling)
+        test_cases.emplace_back(new test_col2im_1d(type, 16, 32, 197, 8, 0)); // kernel = 2*stride
+        test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 7, 2, 0));
+        test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 1, 0));    // stride 1, no overlap
+        test_cases.emplace_back(new test_col2im_1d(type, 6, 4, 11, 3, 1));    // with cropping
+        test_cases.emplace_back(new test_col2im_1d(type, 2, 3, 9, 3, 0));     // kernel < stride, gap positions are zeroed
+        test_cases.emplace_back(new test_col2im_1d(type, 5, 4, 11, 2, 0));    // kernel not a multiple of stride, alternating overlap
+        test_cases.emplace_back(new test_col2im_1d(type, 8, 4, 13, 4, 2));    // padding = stride/2 (DAC causal cropping)
+        test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 1, 2, 0));     // single column, pure kernel unfold
+        test_cases.emplace_back(new test_col2im_1d(type, 16, 1, 197, 8, 0));   // OC = 1, mono output stage
+        test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 3, 0));     // K = 1 with stride > 1, sparse scatter
+        test_cases.emplace_back(new test_col2im_1d(type, 8, 2, 3, 2, 5));      // cropping eats most of the signal, T_out = 2
+    }
+
     for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
         test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1, kernel_type));
         test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
@@ -9366,6 +9414,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
     }
 
+    // Memory bound overlap-add of the GEMM + col2im_1d transposed conv path, real vocoder stage shapes
+    test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 16, 512, 2048, 8, 0));
+    test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 4, 128, 65536, 2, 0));
+    test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F16, 16, 512, 2048, 8, 0));
+
     test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
 
 
diff --git a/tests/test-col2im-1d.cpp b/tests/test-col2im-1d.cpp
new file mode 100644
index 0000000000..f1d36479b3
--- /dev/null
+++ b/tests/test-col2im-1d.cpp
@@ -0,0 +1,159 @@
+// test-col2im-1d.cpp: validate GGML_OP_COL2IM_1D against ggml_conv_transpose_1d.
+//
+// A ConvTranspose1d factorizes as a GEMM followed by an overlap-add:
+//   conv_transpose_1d(w, x)  equals  col2im_1d(mul_mat(w_perm, x_t), s0, OC, p0)
+// with w_perm the [IC, K*OC] permutation of the [K, OC, IC] kernel and x_t the
+// [IC, T_in] transpose of the [T_in, IC] input. The test derives both alternative
+// layouts from one logical weight and one logical input with graph ops only
+// (permute + cont + reshape), runs the two paths on the CPU backend, and compares
+// them in F32. The F16 and BF16 kernels are exercised by casting the column
+// matrix before the scatter. Cropping (p0 > 0) is checked against the shifted
+// slice of the uncropped reference, which conv_transpose_1d cannot express.
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+
+// One geometry: kernel size, output channels, input length, stride, crop
+struct col2im_case {
+    int64_t K;  // kernel size: taps per output channel in one column
+    int64_t OC;  // output channels
+    int64_t T_in;  // input length, i.e. number of columns
+    int     s0;  // stride of the overlap-add
+    int     p0;  // samples cropped from both ends of the output
+};
+
+// Mirrors the eval grid of test-backend-ops
+static const col2im_case CASES[] = {  // columns: { K, OC, T_in, s0, p0 }
+    { 16, 32, 197, 8, 0 },  // kernel = 2*stride, DAC upsampling shape
+    {  4,  3,   7, 2, 0 },  // kernel = 2*stride, small shape
+    {  1,  5,  13, 1, 0 },  // stride 1, no overlap
+    {  6,  4,  11, 3, 1 },  // with cropping
+    {  2,  3,   9, 3, 0 },  // kernel < stride, gap positions are zeroed
+    {  5,  4,  11, 2, 0 },  // kernel not a multiple of stride, alternating overlap
+    {  8,  4,  13, 4, 2 },  // padding = stride/2, DAC causal cropping
+    {  4,  3,   1, 2, 0 },  // single column, pure kernel unfold
+    { 16,  1, 197, 8, 0 },  // OC = 1, mono output stage
+    {  1,  5,  13, 3, 0 },  // K = 1 with stride > 1, sparse scatter
+    {  8,  2,   3, 2, 5 },  // cropping eats most of the signal, T_out = 2
+};
+
+// Input channels of the GEMM (the dimension mul_mat contracts over), shared by every case
+static const int64_t IC = 7;
+
+// Deterministic 64-bit LCG; bits 33..56 of each new state are scaled to [-1, 1)
+static uint64_t g_rng = 0x12345678ULL;
+static float frand(void) {
+    g_rng = 1442695040888963407ULL + 6364136223846793005ULL * g_rng;
+    return (float) ((g_rng >> 33) & 0xffffffULL) / 8388608.0f - 1.0f;  // 24 bits / 2^23, recentred
+}
+
+// Read a dense F32/F16/BF16 tensor back as a flat F32 vector, in storage order.
+// Any other type, or a non-contiguous tensor, aborts instead of being misread as BF16.
+static std::vector<float> tensor_to_f32(const struct ggml_tensor * t) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32 || t->type == GGML_TYPE_F16 || t->type == GGML_TYPE_BF16);
+    GGML_ASSERT(ggml_is_contiguous(t));  // t->data is indexed flat below
+
+    const int64_t n = ggml_nelements(t);
+    std::vector<float> out(n);
+    for (int64_t i = 0; i < n; i++) {
+        switch (t->type) {
+            case GGML_TYPE_F32:  out[i] = ((const float *) t->data)[i]; break;
+            case GGML_TYPE_F16:  out[i] = ggml_fp16_to_fp32(((const ggml_fp16_t *) t->data)[i]); break;
+            default:             out[i] = ggml_bf16_to_fp32(((const ggml_bf16_t *) t->data)[i]); break;  // BF16, per the assert
+        }
+    }
+    return out;
+}
+
+// NMSE of the cropped output against the p0 shifted slice of the full reference
+static double nmse_cropped(const float * y, const float * ref, int64_t T_out, int64_t T_ref, int64_t OC, int p0) {
+    double err_energy = 0.0;  // sum of squared differences
+    double ref_energy = 0.0;  // sum of squared reference samples
+    for (int64_t ch = 0; ch < OC; ch++) {
+        for (int64_t i = 0; i < T_out; i++) {
+            const double want = ref[ch * T_ref + p0 + i];  // reference row skips its first p0 samples
+            const double diff = (double) y[ch * T_out + i] - want;
+            err_energy += diff * diff;
+            ref_energy += want * want;
+        }
+    }
+    return err_energy / (ref_energy + 1e-30);  // the bias keeps an all-zero reference finite
+}
+
+int main(void) {
+    // Runs every geometry through both paths; the exit code is 0 only when all of them agree
+    int fails = 0;
+    for (const col2im_case & c : CASES) {
+        const int64_t T_ref = (c.T_in - 1) * c.s0 + c.K;  // uncropped output length
+        const int64_t T_out = T_ref - 2 * c.p0;           // after cropping p0 samples on both ends
+
+        struct ggml_init_params params = {
+            /* .mem_size   = */ (size_t) 64 << 20,
+            /* .mem_base   = */ NULL,
+            /* .no_alloc   = */ false,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx != NULL);
+
+        // One logical weight and one logical input feed both paths
+        struct ggml_tensor * w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c.K, c.OC, IC);
+        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c.T_in, IC);
+        for (int64_t i = 0; i < ggml_nelements(w); i++) {
+            ((float *) w->data)[i] = frand();
+        }
+        for (int64_t i = 0; i < ggml_nelements(x); i++) {
+            ((float *) x->data)[i] = frand();
+        }
+
+        // Reference path: the native op, uncropped
+        struct ggml_tensor * y_ref = ggml_conv_transpose_1d(ctx, w, x, c.s0, 0, 1);
+
+        // Decomposed path: [K, OC, IC] -> [IC, K, OC] -> [IC, K*OC], k fastest inside each oc block
+        struct ggml_tensor * w_perm = ggml_cont(ctx, ggml_permute(ctx, w, 1, 2, 0, 3));
+        w_perm                      = ggml_reshape_2d(ctx, w_perm, IC, c.K * c.OC);
+        struct ggml_tensor * x_t    = ggml_cont(ctx, ggml_transpose(ctx, x));
+        struct ggml_tensor * col    = ggml_mul_mat(ctx, w_perm, x_t);
+        struct ggml_tensor * y32    = ggml_col2im_1d(ctx, col, c.s0, (int) c.OC, c.p0);
+
+        // Half precision kernels: the same columns cast before the scatter
+        struct ggml_tensor * y16 = ggml_col2im_1d(ctx, ggml_cast(ctx, col, GGML_TYPE_F16),  c.s0, (int) c.OC, c.p0);
+        struct ggml_tensor * ybf = ggml_col2im_1d(ctx, ggml_cast(ctx, col, GGML_TYPE_BF16), c.s0, (int) c.OC, c.p0);
+
+        GGML_ASSERT(y_ref->ne[0] == T_ref && y_ref->ne[1] == c.OC);
+        GGML_ASSERT(y32->ne[0] == T_out && y32->ne[1] == c.OC);
+        struct ggml_cgraph * gf = ggml_new_graph(ctx);
+        ggml_build_forward_expand(gf, y_ref);
+        ggml_build_forward_expand(gf, y32);
+        ggml_build_forward_expand(gf, y16);
+        ggml_build_forward_expand(gf, ybf);
+        const enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, 4);
+        GGML_ASSERT(status == GGML_STATUS_SUCCESS);  // never score the outputs of a failed compute
+        const std::vector<float> f32 = tensor_to_f32(y32);
+        const std::vector<float> f16 = tensor_to_f32(y16);
+        const std::vector<float> fbf = tensor_to_f32(ybf);
+        const float * ref = (const float *) y_ref->data;
+        const double e32 = nmse_cropped(f32.data(), ref, T_out, T_ref, c.OC, c.p0);
+        const double e16 = nmse_cropped(f16.data(), ref, T_out, T_ref, c.OC, c.p0);
+        const double ebf = nmse_cropped(fbf.data(), ref, T_out, T_ref, c.OC, c.p0);
+
+        // Same thresholds as test-backend-ops: 1e-7 full precision, 5e-4 half
+        const bool ok = e32 <= 1e-7 && e16 <= 5e-4 && ebf <= 5e-4;
+        fails += ok ? 0 : 1;
+        printf("col2im_1d K=%2d OC=%2d T_in=%3d s0=%d p0=%d: nmse f32=%.2e f16=%.2e bf16=%.2e %s\n",
+            (int) c.K, (int) c.OC, (int) c.T_in, c.s0, c.p0, e32, e16, ebf, ok ? "OK" : "FAIL");
+        ggml_free(ctx);
+    }
+
+    if (fails == 0) {  // one literal format string per outcome, each matching its arguments
+        printf("all col2im_1d checks passed\n");
+    } else {
+        printf("%d col2im_1d checks FAILED\n", fails);
+    }
+    return fails == 0 ? 0 : 1;
+}