ggml : add GGML_OP_COL2IM_1D (#24206)

* cpu: add GGML_OP_COL2IM_1D

Add the overlap-add (scatter-add) step of a 1D transposed convolution.
A ConvTranspose1d factorizes as a GEMM followed by col2im: a weight
pre-permuted to [IC, K*OC] is contracted against the [IC, T_in] input
with mul_mat to produce a column matrix [K*OC, T_in], and col2im_1d
scatters those columns back into the [T_out, OC] signal, with
T_out = (T_in - 1)*s0 + K - 2*p0.

Keeping the contraction as a plain mul_mat leaves the heavy work on the
optimized (and quantizable) matmul kernels, so col2im_1d only does the
cheap overlap-add.

CPU uses a gather formulation parallelized over output channels,
supporting F32, F16 and BF16 with an F32 accumulator.

* tests: add backend coverage for GGML_OP_COL2IM_1D

Add test_col2im_1d next to the conv_transpose_1d cases, covering F32,
F16 and BF16 across eight geometries: the canonical kernel = 2*stride
DAC upsampling shape, overlap, no overlap, cropping (p0 = 1 and
p0 = stride/2), kernel < stride with zeroed gaps, kernel not a
multiple of stride, and a single column unfold.

Perf mode gets three real vocoder stage shapes reporting memory
bandwidth. max_nmse_err relaxes to 5e-4 for F16 and BF16.

* cpu: harden GGML_OP_COL2IM_1D

ggml_col2im_1d validates s0, oc, p0 and input contiguity at graph
build time, before the oc division, protecting every backend at once.
The kernel asserts the contiguity its flat indexing assumes and its
doc states the full output length including the crop term.

The kernel parallelizes over the time axis: the split stays balanced
down to OC = 1, where the previous channel split was single threaded.
Values are bit identical on the three real vocoder chains, and two out
of the three improve.

* tests: extend the GGML_OP_COL2IM_1D grid

The eval grid grows to eleven geometries: OC = 1 (mono output stage),
K = 1 with stride > 1 (sparse scatter, every gap position zeroed) and
a crop down to T_out = 2 where all the gather bounds act at once.

* tests: add col2im_1d equivalence test

tests/test-col2im-1d.cpp proves mul_mat + col2im_1d matches the
native ggml_conv_transpose_1d on the CPU backend, F32 bit exact, F16
and BF16 through casts of the column matrix. test-backend-ops cannot
cover this for a CPU only op since the CPU backend is its own
reference there.

* rpc: bump protocol patch version for GGML_OP_COL2IM_1D

GGML_OP_COUNT goes from 96 to 97 with the new op, which trips the
static_assert in ggml-rpc.h. Bump RPC_PROTO_PATCH_VERSION since the
op is appended and no existing op code shifts.
This commit is contained in:
Pascal 2026-06-09 11:01:37 +02:00 committed by GitHub
parent 961e9a3e46
commit 26021699bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 343 additions and 4 deletions

View File

@ -8,10 +8,10 @@ extern "C" {
#define RPC_PROTO_MAJOR_VERSION 4
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define RPC_PROTO_PATCH_VERSION 1
#ifdef __cplusplus
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
#endif
#define GGML_RPC_MAX_SERVERS 16

View File

@ -535,6 +535,7 @@ extern "C" {
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_IM2COL_3D,
GGML_OP_COL2IM_1D,
GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW,
@ -2007,6 +2008,16 @@ extern "C" {
int d1, // dilation dimension 1
bool is_2D);
// col2im_1d: scatter-add GEMM columns back to 1D signal
// a: [K*OC, T_in] (columns from matmul, K = a->ne[0]/OC)
// result: [T_out, OC] where T_out = (T_in - 1)*s0 + K - 2*p0
//
// This is the overlap-add half of a 1D transposed convolution: column t_in
// contributes its K taps of every channel to output positions
// t_in*s0 + k - p0. Inside one column the rows are grouped per output
// channel with k fastest, i.e. row index = oc*K + k.
//
// Requirements (asserted when the node is built):
//   - a is a contiguous matrix of type F32, F16 or BF16
//   - s0 > 0, oc > 0, p0 >= 0
//   - a->ne[0] is a whole multiple of oc
//   - the resulting T_out is > 0
// The result has the same type as a.
GGML_API struct ggml_tensor * ggml_col2im_1d(
struct ggml_context * ctx,
struct ggml_tensor * a, // columns [K*OC, T_in]
int s0, // stride
int oc, // output channels
int p0); // padding to crop from both sides
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel

View File

@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_3d(params, tensor);
} break;
case GGML_OP_COL2IM_1D:
{
ggml_compute_forward_col2im_1d(params, tensor);
} break;
case GGML_OP_CONV_2D:
{
ggml_compute_forward_conv_2d(params, tensor);
@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_COL2IM_1D:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
{

View File

@ -6730,6 +6730,78 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
return (coord + size) % size; // adding size avoids negative number weirdness
}
// ggml_compute_forward_col2im_1d
//
// Overlap-add of a column matrix [K*OC, T_in] into a signal [T_out, OC],
// where T_out = (T_in - 1)*s + K - 2*p is taken from dst->ne[0].
// Written as a gather: every output sample sums the (at most ceil(K/s))
// column entries that land on it, so no two threads write the same element.
// Work is split over the time axis, which keeps the split balanced whatever OC is.
// Input and output share one element type (F32, F16 or BF16); sums are kept in F32.
template <typename elem_t>
static void ggml_compute_forward_col2im_1d_impl(
        const ggml_compute_params * params,
        ggml_tensor * dst) {
    const ggml_tensor * src = dst->src[0]; // [K*OC, T_in]

    // the flat indexing below assumes densely packed tensors
    GGML_ASSERT(ggml_is_contiguous(src));
    GGML_ASSERT(ggml_is_contiguous(dst));

    // op params: { stride, output channels, crop }
    const int32_t * op_params = (const int32_t *) dst->op_params;
    const int32_t stride = op_params[0];
    const int32_t n_ch   = op_params[1];
    const int32_t crop   = op_params[2];

    const int64_t col_len = src->ne[0];      // K*OC
    const int64_t n_cols  = src->ne[1];      // T_in
    const int64_t ksize   = col_len / n_ch;  // K
    const int64_t n_out   = dst->ne[0];      // T_out

    const elem_t * cols = (const elem_t *) src->data;
    elem_t       * out  = (elem_t *) dst->data;

    // This thread's slice [t_begin, t_end) of the time axis; the slice is
    // empty when there are more threads than output samples
    const int64_t chunk   = (n_out + params->nth - 1) / params->nth;
    const int64_t t_begin = chunk * params->ith;
    int64_t       t_end   = t_begin + chunk;
    if (t_end > n_out) {
        t_end = n_out;
    }

    for (int64_t ch = 0; ch < n_ch; ch++) {
        // col layout: [K*OC, T_in], the taps of channel ch start at row ch*K
        const elem_t * ch_cols = cols + ch * ksize;
        // dst layout: [T_out, OC], channel ch is one contiguous row
        elem_t       * ch_out  = out + ch * n_out;

        for (int64_t t = t_begin; t < t_end; t++) {
            // position in the uncropped signal
            const int64_t pos = t + crop;

            // Columns c with 0 <= pos - c*stride < K:
            // c runs from ceil((pos-K+1)/stride) to floor(pos/stride), clamped to [0, T_in-1]
            int64_t first = (pos - ksize + 1 + stride - 1) / stride;
            if (first < 0) {
                first = 0;
            }
            int64_t last = pos / stride;
            if (last >= n_cols) {
                last = n_cols - 1;
            }

            float acc = 0.0f;
            for (int64_t c = first; c <= last; c++) {
                const int64_t tap = pos - c * stride;
                if (tap < 0 || tap >= ksize) {
                    continue;
                }
                acc += type_conversion_table<elem_t>::to_f32(ch_cols[tap + c * col_len]);
            }

            ch_out[t] = type_conversion_table<elem_t>::from_f32(acc);
        }
    }
}
void ggml_compute_forward_col2im_1d(
const ggml_compute_params * params,
ggml_tensor * dst) {
switch (dst->src[0]->type) {
case GGML_TYPE_F32: ggml_compute_forward_col2im_1d_impl<float> (params, dst); break;
case GGML_TYPE_F16: ggml_compute_forward_col2im_1d_impl<ggml_fp16_t>(params, dst); break;
case GGML_TYPE_BF16: ggml_compute_forward_col2im_1d_impl<ggml_bf16_t>(params, dst); break;
default: GGML_ABORT("col2im_1d: unsupported type %d", dst->src[0]->type);
}
}
// ggml_compute_forward_conv_2d

View File

@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@ -1031,6 +1031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"IM2COL",
"IM2COL_BACK",
"IM2COL_3D",
"COL2IM_1D",
"CONV_2D",
"CONV_3D",
"CONV_2D_DW",
@ -1080,7 +1081,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -1141,6 +1142,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"im2col(x)",
"im2col_back(x)",
"im2col_3d(x)",
"col2im_1d(x)",
"conv_2d(x)",
"conv_3d(x)",
"conv_2d_dw(x)",
@ -1190,7 +1192,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4541,6 +4543,41 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
}
// ggml_col2im_1d

// Build a GGML_OP_COL2IM_1D node: overlap-add the columns a = [K*OC, T_in]
// into a signal [T_out, OC] with T_out = (T_in - 1)*s0 + K - 2*p0.
//
//   a  : contiguous F32/F16/BF16 matrix, a->ne[0] a whole multiple of oc
//   s0 : stride, > 0
//   oc : output channels, > 0 (K is derived as a->ne[0] / oc)
//   p0 : samples cropped from both ends of the signal, >= 0
//
// Returns a new tensor of the same type as a. Every precondition is checked
// here, at graph build time, so the compute kernels can rely on them.
struct ggml_tensor * ggml_col2im_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   s0,
        int                   oc,
        int                   p0) {
    GGML_ASSERT(ggml_is_matrix(a));
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16);
    GGML_ASSERT(s0 > 0);
    GGML_ASSERT(oc > 0); // checked before the division below
    GGML_ASSERT(p0 >= 0);

    const int64_t K_OC = a->ne[0];
    const int64_t T_in = a->ne[1];
    const int64_t K    = K_OC / oc;

    GGML_ASSERT(K > 0);
    GGML_ASSERT(K_OC == K * oc); // a->ne[0] must be a whole number of oc blocks

    // Widen p0 before doubling it: 2 * p0 evaluated in int overflows for
    // p0 > INT_MAX/2, whereas the 64-bit form just yields a negative T_out
    // that the assert below rejects
    const int64_t T_out = (T_in - 1) * s0 + K - 2 * (int64_t) p0;
    GGML_ASSERT(T_out > 0);

    const int64_t ne[4] = { T_out, oc, 1, 1 };
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 2, ne);

    // the kernel reads these back in this order: stride, output channels, crop
    int32_t params[] = { s0, (int32_t)oc, (int32_t)p0 };
    ggml_set_op_params(result, params, sizeof(params));

    result->op     = GGML_OP_COL2IM_1D;
    result->src[0] = a;

    return result;
}
// ggml_conv_transpose_1d
static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {

View File

@ -265,6 +265,7 @@ if (NOT GGML_BACKEND_DL)
llama_build_and_test(test-quantize-fns.cpp)
llama_build_and_test(test-quantize-perf.cpp)
llama_build_and_test(test-rope.cpp)
llama_build_and_test(test-col2im-1d.cpp)
endif()
# libmtmd

View File

@ -5098,6 +5098,39 @@ struct test_conv_transpose_1d : public test_case {
}
};
// GGML_OP_COL2IM_1D
// Backend test for the overlap-add op: a [K*OC, T_in] column matrix of the
// given type is fed to ggml_col2im_1d with stride s0 and crop p0.
struct test_col2im_1d : public test_case {
    const ggml_type type;
    const int64_t   K;     // kernel size
    const int64_t   OC;    // output channels
    const int64_t   T_in;  // input length (number of columns)
    const int       s0;    // stride
    const int       p0;    // padding cropped from both sides

    test_col2im_1d(ggml_type type = GGML_TYPE_F32,
            int64_t K = 4, int64_t OC = 3, int64_t T_in = 7,
            int s0 = 2, int p0 = 0)
        : type(type), K(K), OC(OC), T_in(T_in), s0(s0), p0(p0) {}

    std::string vars() override {
        return VARS_TO_STR6(type, K, OC, T_in, s0, p0);
    }

    // F32 keeps a tight bound, the F16/BF16 cases get a relaxed one
    double max_nmse_err() override {
        if (type == GGML_TYPE_F32) {
            return 1e-7;
        }
        return 5e-4;
    }

    ggml_tensor * build_graph(ggml_context * ctx) override {
        const int64_t col_len = K*OC;

        ggml_tensor * cols = ggml_new_tensor_2d(ctx, type, col_len, T_in);
        ggml_set_name(cols, "cols");

        ggml_tensor * out = ggml_col2im_1d(ctx, cols, s0, (int) OC, p0);
        ggml_set_name(out, "out");

        return out;
    }
};
// GGML_OP_CONV_TRANSPOSE_2D
struct test_conv_transpose_2d : public test_case {
// Dimensions
@ -8013,6 +8046,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16}) {
// ConvTranspose1d expressed as mul_mat + col2im (DAC decoder upsampling)
test_cases.emplace_back(new test_col2im_1d(type, 16, 32, 197, 8, 0)); // kernel = 2*stride
test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 7, 2, 0));
test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 1, 0)); // stride 1, no overlap
test_cases.emplace_back(new test_col2im_1d(type, 6, 4, 11, 3, 1)); // with cropping
test_cases.emplace_back(new test_col2im_1d(type, 2, 3, 9, 3, 0)); // kernel < stride, gap positions are zeroed
test_cases.emplace_back(new test_col2im_1d(type, 5, 4, 11, 2, 0)); // kernel not a multiple of stride, alternating overlap
test_cases.emplace_back(new test_col2im_1d(type, 8, 4, 13, 4, 2)); // padding = stride/2 (DAC causal cropping)
test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 1, 2, 0)); // single column, pure kernel unfold
test_cases.emplace_back(new test_col2im_1d(type, 16, 1, 197, 8, 0)); // OC = 1, mono output stage
test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 3, 0)); // K = 1 with stride > 1, sparse scatter
test_cases.emplace_back(new test_col2im_1d(type, 8, 2, 3, 2, 5)); // cropping eats most of the signal, T_out = 2
}
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1, kernel_type));
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
@ -9366,6 +9414,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
}
// Memory bound overlap-add of the GEMM + col2im_1d transposed conv path, real vocoder stage shapes
test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 16, 512, 2048, 8, 0));
test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 4, 128, 65536, 2, 0));
test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F16, 16, 512, 2048, 8, 0));
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));

159
tests/test-col2im-1d.cpp Normal file
View File

@ -0,0 +1,159 @@
// test-col2im-1d.cpp: validate GGML_OP_COL2IM_1D against ggml_conv_transpose_1d.
//
// A ConvTranspose1d factorizes as a GEMM followed by an overlap-add:
// conv_transpose_1d(w, x) equals col2im_1d(mul_mat(w_perm, x_t), s0, OC, p0)
// with w_perm the [IC, K*OC] permutation of the [K, OC, IC] kernel and x_t the
// [IC, T_in] transpose of the [T_in, IC] input. The test derives both alternative
// layouts from one logical weight and one logical input with graph ops only
// (permute + cont + reshape), runs the two paths on the CPU backend, and compares
// them in F32. The F16 and BF16 kernels are exercised by casting the column
// matrix before the scatter. Cropping (p0 > 0), which conv_transpose_1d cannot
// express, is checked against the p0-shifted slice of the uncropped reference.
#include "ggml.h"
#include "ggml-cpu.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>
// One geometry: kernel size, output channels, input length, stride, crop
// Field order matters: the CASES table initializes entries positionally.
struct col2im_case {
int64_t K; // kernel size
int64_t OC; // output channels
int64_t T_in; // input length (number of columns)
int s0; // stride
int p0; // samples cropped from both ends of the output
};
// Mirrors the eval grid of test-backend-ops
// Columns are { K, OC, T_in, s0, p0 }; each case produces an output of
// length T_out = (T_in - 1)*s0 + K - 2*p0 per channel.
static const col2im_case CASES[] = {
{ 16, 32, 197, 8, 0 }, // kernel = 2*stride, DAC upsampling shape
{ 4, 3, 7, 2, 0 },
{ 1, 5, 13, 1, 0 }, // stride 1, no overlap
{ 6, 4, 11, 3, 1 }, // with cropping
{ 2, 3, 9, 3, 0 }, // kernel < stride, gap positions are zeroed
{ 5, 4, 11, 2, 0 }, // kernel not a multiple of stride, alternating overlap
{ 8, 4, 13, 4, 2 }, // padding = stride/2, DAC causal cropping
{ 4, 3, 1, 2, 0 }, // single column, pure kernel unfold
{ 16, 1, 197, 8, 0 }, // OC = 1, mono output stage
{ 1, 5, 13, 3, 0 }, // K = 1 with stride > 1, sparse scatter
{ 8, 2, 3, 2, 5 }, // cropping eats most of the signal, T_out = 2
};
// Input channels of the GEMM, shared by every case
static const int64_t IC = 7;
// Deterministic 64-bit LCG; each call advances the shared state and maps
// 24 of its upper bits to a float in [-1, 1)
static uint64_t g_rng = 0x12345678ULL;

static float frand(void) {
    const uint64_t multiplier = 6364136223846793005ULL;
    const uint64_t increment  = 1442695040888963407ULL;

    g_rng = g_rng * multiplier + increment;

    // 24 bits fit a float mantissa exactly, so the scaling below is lossless
    const uint64_t bits = (g_rng >> 33) & 0xffffff;
    const float    unit = (float) bits / (float) 0x800000; // [0, 2)

    return unit - 1.0f;
}
// Read a F32/F16/BF16 tensor back as a flat F32 vector
static std::vector<float> tensor_to_f32(const struct ggml_tensor * t) {
const int64_t n = ggml_nelements(t);
std::vector<float> out(n);
if (t->type == GGML_TYPE_F32) {
memcpy(out.data(), t->data, n * sizeof(float));
} else if (t->type == GGML_TYPE_F16) {
for (int64_t i = 0; i < n; i++) {
out[i] = ggml_fp16_to_fp32(((const ggml_fp16_t *) t->data)[i]);
}
} else {
for (int64_t i = 0; i < n; i++) {
out[i] = ggml_bf16_to_fp32(((const ggml_bf16_t *) t->data)[i]);
}
}
return out;
}
// NMSE of the cropped output against the p0 shifted slice of the full reference
//   y   : [T_out, OC] cropped signal, one contiguous row per channel
//   ref : [T_ref, OC] uncropped signal; sample t of y is compared with sample t + p0
// Returns sum((y - ref)^2) / (sum(ref^2) + 1e-30), accumulated in double; the
// tiny bias keeps the ratio finite when the reference slice is all zeros.
static double nmse_cropped(const float * y, const float * ref, int64_t T_out, int64_t T_ref, int64_t OC, int p0) {
    double err_energy = 0.0;
    double ref_energy = 0.0;

    for (int64_t ch = 0; ch < OC; ch++) {
        const int64_t y_base   = ch * T_out;
        const int64_t ref_base = ch * T_ref + p0;

        for (int64_t t = 0; t < T_out; t++) {
            const double got  = y  [y_base   + t];
            const double want = ref[ref_base + t];
            const double diff = got - want;

            err_energy += diff * diff;
            ref_energy += want * want;
        }
    }

    return err_energy / (ref_energy + 1e-30);
}
int main(void) {
int fails = 0;
for (const col2im_case & c : CASES) {
const int64_t T_ref = (c.T_in - 1) * c.s0 + c.K;
const int64_t T_out = T_ref - 2 * c.p0;
struct ggml_init_params params = {
/* .mem_size = */ (size_t) 64 << 20,
/* .mem_base = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
// One logical weight and one logical input feed both paths
struct ggml_tensor * w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c.K, c.OC, IC);
struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c.T_in, IC);
for (int64_t i = 0; i < ggml_nelements(w); i++) {
((float *) w->data)[i] = frand();
}
for (int64_t i = 0; i < ggml_nelements(x); i++) {
((float *) x->data)[i] = frand();
}
// Reference path: the native op, uncropped
struct ggml_tensor * y_ref = ggml_conv_transpose_1d(ctx, w, x, c.s0, 0, 1);
// Decomposed path: [K, OC, IC] -> [IC, K, OC] -> [IC, K*OC], k fastest inside each oc block
struct ggml_tensor * w_perm = ggml_cont(ctx, ggml_permute(ctx, w, 1, 2, 0, 3));
w_perm = ggml_reshape_2d(ctx, w_perm, IC, c.K * c.OC);
struct ggml_tensor * x_t = ggml_cont(ctx, ggml_transpose(ctx, x));
struct ggml_tensor * col = ggml_mul_mat(ctx, w_perm, x_t);
struct ggml_tensor * y32 = ggml_col2im_1d(ctx, col, c.s0, (int) c.OC, c.p0);
// Half precision kernels: the same columns cast before the scatter
struct ggml_tensor * y16 = ggml_col2im_1d(ctx, ggml_cast(ctx, col, GGML_TYPE_F16), c.s0, (int) c.OC, c.p0);
struct ggml_tensor * ybf = ggml_col2im_1d(ctx, ggml_cast(ctx, col, GGML_TYPE_BF16), c.s0, (int) c.OC, c.p0);
GGML_ASSERT(y_ref->ne[0] == T_ref && y_ref->ne[1] == c.OC);
GGML_ASSERT(y32->ne[0] == T_out && y32->ne[1] == c.OC);
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, y_ref);
ggml_build_forward_expand(gf, y32);
ggml_build_forward_expand(gf, y16);
ggml_build_forward_expand(gf, ybf);
ggml_graph_compute_with_ctx(ctx, gf, 4);
const std::vector<float> f32 = tensor_to_f32(y32);
const std::vector<float> f16 = tensor_to_f32(y16);
const std::vector<float> fbf = tensor_to_f32(ybf);
const float * ref = (const float *) y_ref->data;
const double e32 = nmse_cropped(f32.data(), ref, T_out, T_ref, c.OC, c.p0);
const double e16 = nmse_cropped(f16.data(), ref, T_out, T_ref, c.OC, c.p0);
const double ebf = nmse_cropped(fbf.data(), ref, T_out, T_ref, c.OC, c.p0);
// Same thresholds as test-backend-ops: 1e-7 full precision, 5e-4 half
const bool ok = e32 <= 1e-7 && e16 <= 5e-4 && ebf <= 5e-4;
if (!ok) {
fails++;
}
printf("col2im_1d K=%2d OC=%2d T_in=%3d s0=%d p0=%d: nmse f32=%.2e f16=%.2e bf16=%.2e %s\n",
(int) c.K, (int) c.OC, (int) c.T_in, c.s0, c.p0, e32, e16, ebf, ok ? "OK" : "FAIL");
ggml_free(ctx);
}
printf(fails == 0 ? "all col2im_1d checks passed\n" : "%d col2im_1d checks FAILED\n", fails);
return fails == 0 ? 0 : 1;
}