diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
index 6fcf5a4339..5ad121ae57 100644
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -8,10 +8,10 @@ extern "C" {
 
 #define RPC_PROTO_MAJOR_VERSION    4
 #define RPC_PROTO_MINOR_VERSION    0
-#define RPC_PROTO_PATCH_VERSION    0
+#define RPC_PROTO_PATCH_VERSION    1
 
 #ifdef  __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif
 
 #define GGML_RPC_MAX_SERVERS       16
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f672526550..374934aacf 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -535,6 +535,7 @@ extern "C" {
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
         GGML_OP_IM2COL_3D,
+        GGML_OP_COL2IM_1D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
@@ -2007,6 +2008,16 @@ extern "C" {
             int                   d1, // dilation dimension 1
             bool                  is_2D);
 
+    // col2im_1d: scatter-add GEMM columns back to 1D signal
+    // a:      [K*OC, T_in]  (columns from matmul, K = a->ne[0]/OC)
+    // result: [T_out, OC]   where T_out = (T_in - 1)*s0 + K - 2*p0
+    GGML_API struct ggml_tensor * ggml_col2im_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,     // columns [K*OC, T_in]
+            int                   s0,    // stride
+            int                   oc,    // output channels
+            int                   p0);   // padding to crop from both sides
+
     GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index cd5c61a818..af7827aec3 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_3d(params, tensor);
             } break;
+        case GGML_OP_COL2IM_1D:
+            {
+                ggml_compute_forward_col2im_1d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D:
             {
                 ggml_compute_forward_conv_2d(params, tensor);
@@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_CONV_2D:
         case GGML_OP_CONV_3D:
         case GGML_OP_CONV_2D_DW:
+        case GGML_OP_COL2IM_1D:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index becac9d6ef..86842e5547 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6730,6 +6730,90 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
     return (coord + size) % size; // adding size avoids negative number weirdness
 }
 
+// ggml_compute_forward_col2im_1d
+//
+// Scatter-add columns [K*OC, T_in] -> signal [T_out, OC]
+// where T_out = (T_in - 1)*s + K - 2*p. Gather approach: each output sums the at most
+// ceil(K/s) column entries that land on it, so no output element is written twice.
+// Parallelized over the time axis so the split stays balanced whatever OC is.
+// Supports F32, F16, BF16 input/output (same type), F32 accumulator.
+
+template <typename elem_t>
+static void ggml_compute_forward_col2im_1d_impl(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src = dst->src[0]; // [K*OC, T_in]
+
+    GGML_ASSERT(src->type == dst->type);
+    GGML_ASSERT(ggml_is_contiguous(src));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t OC = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+
+    // op_params are read back raw here: guard the divisions by OC and s0 below
+    GGML_ASSERT(s0 > 0 && OC > 0);
+
+    const int64_t K_OC  = src->ne[0];
+    const int64_t T_in  = src->ne[1];
+    const int64_t K     = K_OC / OC;
+    const int64_t T_out = dst->ne[0];
+
+    // dst is written at t_out + oc * T_out with oc < OC, so it must hold exactly OC rows
+    GGML_ASSERT(dst->ne[1] == OC);
+
+    const elem_t * col_data = (const elem_t *) src->data;
+    elem_t       * dst_data = (elem_t *) dst->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // Parallelize over the time axis: the split stays balanced whatever OC is,
+    // down to OC = 1 for mono audio. Threads write disjoint [it0, it1) output ranges;
+    // the columns they gather from can be shared at the range edges, which is fine
+    // because src is only read
+    const int64_t dr  = (T_out + nth - 1) / nth;
+    const int64_t it0 = dr * ith;
+    const int64_t it1 = it0 + dr < T_out ? it0 + dr : T_out;
+
+    for (int64_t oc = 0; oc < OC; oc++) {
+        for (int64_t t_out = it0; t_out < it1; t_out++) {
+            const int64_t t_abs = t_out + p0; // absolute position in uncropped signal
+            // Gather: find all (t_in, k) where t_in * s + k == t_abs, 0 <= k < K
+            // Lower bound is ceil((t_abs-K+1)/s). Integer division truncates toward zero, so a
+            // negative numerator is not a true ceil, but the result is <= 0 and clamped to 0
+            int64_t t_in_min = (t_abs - K + 1 + s0 - 1) / s0;
+            if (t_in_min < 0) t_in_min = 0;
+            int64_t t_in_max = t_abs / s0;
+            if (t_in_max >= T_in) t_in_max = T_in - 1;
+
+            float sum = 0.0f;
+            for (int64_t t_in = t_in_min; t_in <= t_in_max; t_in++) {
+                int64_t k = t_abs - t_in * s0;
+                if (k >= 0 && k < K) {
+                    // col layout: [K*OC, T_in], element (oc*K+k, t_in)
+                    sum += type_conversion_table<elem_t>::to_f32(col_data[(oc * K + k) + t_in * K_OC]);
+                }
+            }
+            // dst layout: [T_out, OC], element (t_out, oc)
+            dst_data[t_out + oc * T_out] = type_conversion_table<elem_t>::from_f32(sum);
+        }
+    }
+}
+
+void ggml_compute_forward_col2im_1d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    switch (dst->src[0]->type) {
+        case GGML_TYPE_F32:  ggml_compute_forward_col2im_1d_impl<float>      (params, dst); break;
+        case GGML_TYPE_F16:  ggml_compute_forward_col2im_1d_impl<ggml_fp16_t>(params, dst); break;
+        case GGML_TYPE_BF16: ggml_compute_forward_col2im_1d_impl<ggml_bf16_t>(params, dst); break;
+        default: GGML_ABORT("col2im_1d: unsupported type %d", (int) dst->src[0]->type);
+    }
+}
+
 // ggml_compute_forward_conv_2d
 
 
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 7398e56189..a8e18c716d 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 8815c67d8b..18a5ebd2ab 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1031,6 +1031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "IM2COL",
     "IM2COL_BACK",
     "IM2COL_3D",
+    "COL2IM_1D",
     "CONV_2D",
     "CONV_3D",
     "CONV_2D_DW",
@@ -1080,7 +1081,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1141,6 +1142,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "im2col(x)",
     "im2col_back(x)",
     "im2col_3d(x)",
+    "col2im_1d(x)",
     "conv_2d(x)",
     "conv_3d(x)",
     "conv_2d_dw(x)",
@@ -1190,7 +1192,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4541,6 +4543,41 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
     return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
 }
 
+// ggml_col2im_1d
+
+struct ggml_tensor * ggml_col2im_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   s0,
+        int                   oc,
+        int                   p0) {
+    GGML_ASSERT(ggml_is_matrix(a));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16);
+    GGML_ASSERT(s0 > 0);
+    GGML_ASSERT(oc > 0);
+    GGML_ASSERT(p0 >= 0);
+
+    const int64_t K_OC  = a->ne[0];
+    const int64_t T_in  = a->ne[1];
+    const int64_t K     = K_OC / oc;
+    const int64_t T_out = (T_in - 1) * s0 + K - 2 * p0;
+
+    GGML_ASSERT(K_OC == K * oc); // a->ne[0] must be a whole number of oc blocks
+    GGML_ASSERT(K > 0 && T_out > 0);
+
+    const int64_t ne[4] = { T_out, oc, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 2, ne);
+
+    int32_t params[] = { s0, (int32_t)oc, (int32_t)p0 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_COL2IM_1D;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_conv_transpose_1d
 
 static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 33ae3b303c..e284a58d1c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -265,6 +265,7 @@ if (NOT GGML_BACKEND_DL)
     llama_build_and_test(test-quantize-fns.cpp)
     llama_build_and_test(test-quantize-perf.cpp)
     llama_build_and_test(test-rope.cpp)
+    llama_build_and_test(test-col2im-1d.cpp)
 endif()
 
 # libmtmd
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index f561d09b5b..c30b4e9815 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5098,6 +5098,39 @@ struct test_conv_transpose_1d : public test_case {
     }
 };
 
+// GGML_OP_COL2IM_1D
+struct test_col2im_1d : public test_case {
+    const ggml_type type;
+    const int64_t K;    // kernel size
+    const int64_t OC;   // output channels
+    const int64_t T_in; // input length (number of columns)
+    const int s0;       // stride
+    const int p0;       // padding cropped from both sides
+
+    std::string vars() override {
+        return VARS_TO_STR6(type, K, OC, T_in, s0, p0);
+    }
+
+    double max_nmse_err() override {
+        return type == GGML_TYPE_F32 ? 1e-7 : 5e-4;
+    }
+
+    test_col2im_1d(ggml_type type = GGML_TYPE_F32,
+            int64_t K = 4, int64_t OC = 3, int64_t T_in = 7,
+            int s0 = 2, int p0 = 0)
+        : type(type), K(K), OC(OC), T_in(T_in), s0(s0), p0(p0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * cols = ggml_new_tensor_2d(ctx, type, K*OC, T_in);
+        ggml_set_name(cols, "cols");
+
+        ggml_tensor * out = ggml_col2im_1d(ctx, cols, s0, (int) OC, p0);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_CONV_TRANSPOSE_2D
 struct test_conv_transpose_2d : public test_case {
     // Dimensions
@@ -8013,6 +8046,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
+    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16}) {
+        // ConvTranspose1d expressed as mul_mat + col2im (DAC decoder upsampling)
+        test_cases.emplace_back(new test_col2im_1d(type, 16, 32, 197, 8, 0)); // kernel = 2*stride
+        test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 7, 2, 0));
+        test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 1, 0)); // stride 1, no overlap
+        test_cases.emplace_back(new test_col2im_1d(type, 6, 4, 11, 3, 1)); // with cropping
+        test_cases.emplace_back(new test_col2im_1d(type, 2, 3, 9, 3, 0)); // kernel < stride, gap positions are zeroed
+        test_cases.emplace_back(new test_col2im_1d(type, 5, 4, 11, 2, 0)); // kernel not a multiple of stride, alternating overlap
+        test_cases.emplace_back(new test_col2im_1d(type, 8, 4, 13, 4, 2)); // padding = stride/2 (DAC causal cropping)
+        test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 1, 2, 0)); // single column, pure kernel unfold
+        test_cases.emplace_back(new test_col2im_1d(type, 16, 1, 197, 8, 0)); // OC = 1, mono output stage
+        test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 3, 0)); // K = 1 with stride > 1, sparse scatter
+        test_cases.emplace_back(new test_col2im_1d(type, 8, 2, 3, 2, 5)); // cropping eats most of the signal, T_out = 2
+    }
+
     for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
         test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1, kernel_type));
         test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
@@ -9366,6 +9414,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
     }
 
+    // Memory bound overlap-add of the GEMM + col2im_1d transposed conv path, real vocoder stage shapes
+    test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 16, 512, 2048, 8, 0));
+    test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 4, 128, 65536, 2, 0));
+    test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F16, 16, 512, 2048, 8, 0));
+
     test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
 
 
diff --git a/tests/test-col2im-1d.cpp b/tests/test-col2im-1d.cpp
new file mode 100644
index 0000000000..f1d36479b3
--- /dev/null
+++ b/tests/test-col2im-1d.cpp
@@ -0,0 +1,169 @@
+// test-col2im-1d.cpp: validate GGML_OP_COL2IM_1D against ggml_conv_transpose_1d.
+//
+// A ConvTranspose1d factorizes as a GEMM followed by an overlap-add:
+//   conv_transpose_1d(w, x) equals col2im_1d(mul_mat(w_perm, x_t), s0, OC, p0)
+// with w_perm the [IC, K*OC] permutation of the [K, OC, IC] kernel and x_t the
+// [IC, T_in] transpose of the [T_in, IC] input. The test derives both alternative
+// layouts from one logical weight and one logical input with graph ops only
+// (permute + cont + reshape), runs the two paths on the CPU backend, and compares
+// them in F32. The F16 and BF16 kernels are exercised by casting the column
+// matrix before the scatter. Cropping (p0 > 0) is checked against the shifted
+// slice of the uncropped reference, which conv_transpose_1d cannot express.
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+// One geometry: kernel size, output channels, input length, stride, crop
+struct col2im_case {
+    int64_t K;
+    int64_t OC;
+    int64_t T_in;
+    int     s0;
+    int     p0;
+};
+
+// Mirrors the eval grid of test-backend-ops
+static const col2im_case CASES[] = {
+    { 16, 32, 197, 8, 0 }, // kernel = 2*stride, DAC upsampling shape
+    {  4,  3,   7, 2, 0 },
+    {  1,  5,  13, 1, 0 }, // stride 1, no overlap
+    {  6,  4,  11, 3, 1 }, // with cropping
+    {  2,  3,   9, 3, 0 }, // kernel < stride, gap positions are zeroed
+    {  5,  4,  11, 2, 0 }, // kernel not a multiple of stride, alternating overlap
+    {  8,  4,  13, 4, 2 }, // padding = stride/2, DAC causal cropping
+    {  4,  3,   1, 2, 0 }, // single column, pure kernel unfold
+    { 16,  1, 197, 8, 0 }, // OC = 1, mono output stage
+    {  1,  5,  13, 3, 0 }, // K = 1 with stride > 1, sparse scatter
+    {  8,  2,   3, 2, 5 }, // cropping eats most of the signal, T_out = 2
+};
+
+// Input channels of the GEMM, shared by every case
+static const int64_t IC = 7;
+
+// Deterministic LCG; the top 24 state bits are mapped to [-1, 1)
+static uint64_t g_rng = 0x12345678ULL;
+static float frand(void) {
+    g_rng = g_rng * 6364136223846793005ULL + 1442695040888963407ULL;
+    return (float)((g_rng >> 33) & 0xffffff) / (float)0x800000 - 1.0f;
+}
+
+// Read a F32/F16/BF16 tensor back as a flat F32 vector; any other type aborts
+static std::vector<float> tensor_to_f32(const struct ggml_tensor * t) {
+    const int64_t n = ggml_nelements(t);
+    std::vector<float> out(n);
+    if (t->type == GGML_TYPE_F32) {
+        memcpy(out.data(), t->data, n * sizeof(float));
+    } else if (t->type == GGML_TYPE_F16) {
+        for (int64_t i = 0; i < n; i++) {
+            out[i] = ggml_fp16_to_fp32(((const ggml_fp16_t *) t->data)[i]);
+        }
+    } else if (t->type == GGML_TYPE_BF16) {
+        for (int64_t i = 0; i < n; i++) {
+            out[i] = ggml_bf16_to_fp32(((const ggml_bf16_t *) t->data)[i]);
+        }
+    } else {
+        GGML_ABORT("tensor_to_f32: unsupported type %s", ggml_type_name(t->type));
+    }
+    return out;
+}
+
+// NMSE of the cropped output against the p0 shifted slice of the full reference
+static double nmse_cropped(const float * y, const float * ref, int64_t T_out, int64_t T_ref, int64_t OC, int p0) {
+    double num = 0.0;
+    double den = 0.0;
+    for (int64_t oc = 0; oc < OC; oc++) {
+        for (int64_t t = 0; t < T_out; t++) {
+            const double a = y  [t      + oc * T_out];
+            const double b = ref[t + p0 + oc * T_ref];
+            num += (a - b) * (a - b);
+            den += b * b;
+        }
+    }
+    return num / (den + 1e-30);
+}
+
+int main(void) {
+    int fails = 0;
+
+    for (const col2im_case & c : CASES) {
+        const int64_t T_ref = (c.T_in - 1) * c.s0 + c.K;
+        const int64_t T_out = T_ref - 2 * c.p0;
+
+        struct ggml_init_params params = {
+            /* .mem_size   = */ (size_t) 64 << 20,
+            /* .mem_base   = */ NULL,
+            /* .no_alloc   = */ false,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx != NULL);
+
+        // One logical weight and one logical input feed both paths
+        struct ggml_tensor * w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c.K, c.OC, IC);
+        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c.T_in, IC);
+        for (int64_t i = 0; i < ggml_nelements(w); i++) {
+            ((float *) w->data)[i] = frand();
+        }
+        for (int64_t i = 0; i < ggml_nelements(x); i++) {
+            ((float *) x->data)[i] = frand();
+        }
+
+        // Reference path: the native op, uncropped
+        struct ggml_tensor * y_ref = ggml_conv_transpose_1d(ctx, w, x, c.s0, 0, 1);
+
+        // Decomposed path: [K, OC, IC] -> [IC, K, OC] -> [IC, K*OC], k fastest inside each oc block
+        struct ggml_tensor * w_perm = ggml_cont(ctx, ggml_permute(ctx, w, 1, 2, 0, 3));
+        w_perm = ggml_reshape_2d(ctx, w_perm, IC, c.K * c.OC);
+        struct ggml_tensor * x_t = ggml_cont(ctx, ggml_transpose(ctx, x));
+        struct ggml_tensor * col = ggml_mul_mat(ctx, w_perm, x_t);
+        struct ggml_tensor * y32 = ggml_col2im_1d(ctx, col, c.s0, (int) c.OC, c.p0);
+
+        // Half precision kernels: the same columns cast before the scatter
+        struct ggml_tensor * y16 = ggml_col2im_1d(ctx, ggml_cast(ctx, col, GGML_TYPE_F16), c.s0, (int) c.OC, c.p0);
+        struct ggml_tensor * ybf = ggml_col2im_1d(ctx, ggml_cast(ctx, col, GGML_TYPE_BF16), c.s0, (int) c.OC, c.p0);
+
+        GGML_ASSERT(y_ref->ne[0] == T_ref && y_ref->ne[1] == c.OC);
+        GGML_ASSERT(y32->ne[0] == T_out && y32->ne[1] == c.OC);
+        GGML_ASSERT(y16->type == GGML_TYPE_F16 && y16->ne[0] == T_out && y16->ne[1] == c.OC);
+        GGML_ASSERT(ybf->type == GGML_TYPE_BF16 && ybf->ne[0] == T_out && ybf->ne[1] == c.OC);
+
+        struct ggml_cgraph * gf = ggml_new_graph(ctx);
+        ggml_build_forward_expand(gf, y_ref);
+        ggml_build_forward_expand(gf, y32);
+        ggml_build_forward_expand(gf, y16);
+        ggml_build_forward_expand(gf, ybf);
+        const enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, 4);
+        GGML_ASSERT(status == GGML_STATUS_SUCCESS);
+
+        const std::vector<float> f32 = tensor_to_f32(y32);
+        const std::vector<float> f16 = tensor_to_f32(y16);
+        const std::vector<float> fbf = tensor_to_f32(ybf);
+        const float * ref = (const float *) y_ref->data;
+
+        const double e32 = nmse_cropped(f32.data(), ref, T_out, T_ref, c.OC, c.p0);
+        const double e16 = nmse_cropped(f16.data(), ref, T_out, T_ref, c.OC, c.p0);
+        const double ebf = nmse_cropped(fbf.data(), ref, T_out, T_ref, c.OC, c.p0);
+
+        // Same thresholds as test-backend-ops: 1e-7 full precision, 5e-4 half
+        const bool ok = e32 <= 1e-7 && e16 <= 5e-4 && ebf <= 5e-4;
+        if (!ok) {
+            fails++;
+        }
+        printf("col2im_1d K=%2d OC=%2d T_in=%3d s0=%d p0=%d: nmse f32=%.2e f16=%.2e bf16=%.2e %s\n",
+               (int) c.K, (int) c.OC, (int) c.T_in, c.s0, c.p0, e32, e16, ebf, ok ? "OK" : "FAIL");
+
+        ggml_free(ctx);
+    }
+
+    if (fails == 0) {
+        printf("all col2im_1d checks passed\n");
+    } else {
+        printf("%d col2im_1d checks FAILED\n", fails);
+    }
+    return fails == 0 ? 0 : 1;
+}