diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 1e625cc1cb..eb5eb0eb4e 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -386,6 +386,46 @@ static void ggml_cpy_f32_iq4_nl_cuda( (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } +// check if a same-type copy reduces to a 2D strided copy (height rows of width +// contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel +static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1, + size_t & width, size_t & height, size_t & spitch, size_t & dpitch) { + // require matching shape: a reshaped copy maps elements by flat order, which the + // prefix walk below does not handle + if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) { + return false; + } + + // grow the contiguous prefix block shared by both tensors + size_t block_nb = ggml_element_size(src0); + int d = 0; + for (; d < GGML_MAX_DIMS; ++d) { + if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) { + break; + } + block_nb *= src0->ne[d]; + } + + // d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy) + if (d == 0 || d == GGML_MAX_DIMS) { + return false; + } + + // dim d carries the rows; everything above it must be a single element + for (int i = d + 1; i < GGML_MAX_DIMS; ++i) { + if (src0->ne[i] != 1) { + return false; + } + } + + width = block_nb; + height = src0->ne[d]; + spitch = src0->nb[d]; + dpitch = src1->nb[d]; + + return spitch >= width && dpitch >= width; +} + void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -421,6 +461,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0); + size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0; + if (src0->type == src1->type && contiguous_srcs) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) @@ -431,6 +473,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg { CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } + } else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) { + CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch, + mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream)); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { if (can_be_transposed) { ggml_cpy_scalar_cuda diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 943aef8645..659dbd4b5a 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1053,6 +1053,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { (op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) { return true; } + // CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported + if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) { + return true; + } break; } case GGML_OP_MUL_MAT: { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 09ac62a756..15b50209c8 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2890,12 +2890,17 @@ struct test_cpy : public test_case { const std::array ne_dst; const std::array permute_src; const std::array permute_dst; + const std::array dst_alloc; // if set, dst is a view into a larger buffer (strided) bool _src_use_permute; bool _dst_use_permute; bool _src_transpose; bool _use_dst_shape; + bool _use_dst_alloc; std::string vars() override { + if (_use_dst_alloc) { + return VARS_TO_STR8(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose, dst_alloc); + } if (_use_dst_shape) { return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose); } @@ -2943,12 +2948,15 @@ struct test_cpy : public test_case { std::array ne_dst = {-1, -1, -1, -1}, std::array permute_src = {0, 0, 0, 0}, std::array permute_dst = {0, 0, 0, 0}, - bool transpose_src = false) + bool transpose_src = false, + std::array dst_alloc = {0, 0, 0, 0}) : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst), + dst_alloc(dst_alloc), _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), _src_transpose(transpose_src), - _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){} + _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0), + _use_dst_alloc(dst_alloc[0] > 0){} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data()); @@ -2966,12 +2974,23 @@ struct test_cpy : public test_case { } std::array dst_ne = _use_dst_shape ? ne_dst : std::array{src->ne[0], src->ne[1], src->ne[2], src->ne[3]}; - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); - ggml_set_name(dst, "dst"); + ggml_tensor * dst; - if (_dst_use_permute) { - dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); - ggml_set_name(dst, "dst_permuted"); + if (_use_dst_alloc) { + // view a sub-block of a larger buffer -> strided dst + ggml_tensor * dst_buf = ggml_new_tensor(ctx, type_dst, 4, dst_alloc.data()); + ggml_set_name(dst_buf, "dst_buf"); + dst = ggml_view_4d(ctx, dst_buf, dst_ne[0], dst_ne[1], dst_ne[2], dst_ne[3], + dst_buf->nb[1], dst_buf->nb[2], dst_buf->nb[3], 0); + ggml_set_name(dst, "dst_view"); + } else { + dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); + ggml_set_name(dst, "dst"); + + if (_dst_use_permute) { + dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); + ggml_set_name(dst, "dst_permuted"); + } } ggml_tensor * out = ggml_cpy(ctx, src, dst); @@ -8181,6 +8200,8 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst // CPY - different src/dst shapes (reshaping via CPY) // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.