diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 409fea98..12140a25 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3488,9 +3488,16 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor } } - //printf("%s: using limit = %g\n", __func__, limit); - ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst), - (const float *)dst->data, dst_up.get(), (float *)dst->data, limit); + auto unary_op = (ggml_unary_op)dst->op_params[0]; + if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) { + ggml_swiglu_oai_cuda_f32((const float *)dst->data, (const float *)dst_up.get(), + (float *)dst->data, ggml_nelements(dst), dst->ne[0], dst->ne[0], dst->ne[0], + 1.702f, 7.0f, stream); + } else { + //printf("%s: using limit = %g\n", __func__, limit); + ggml_fused_mul_unary(ctx, unary_op, ggml_nelements(dst), + (const float *)dst->data, dst_up.get(), (float *)dst->data, limit); + } CUDA_CHECK(cudaGetLastError()); } diff --git a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh index ccf5fc67..426a2b09 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh @@ -190,6 +190,13 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel( float u = tmp_u[j][threadIdx.x]; float g = tmp_g[j][threadIdx.x]; float r; + if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) { + constexpr float alpha = 1.702f; + constexpr float limit = 7.0f; + g = fminf(g, limit); + u = fmaxf(fminf(u, limit), -limit); + r = g / (1.0f + expf(-g * alpha)) * (1.0f + u); + } else { switch (unary_op) { case GGML_UNARY_OP_SILU: { @@ -214,6 +221,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel( r = g / (1.0f + expf(-g * alpha)) * (1.0f + u); } break; } + } dst[j*nrows_dst + row0 + threadIdx.x] = r; } } diff --git a/ggml/src/ggml-cuda/mmvq-templates.cuh b/ggml/src/ggml-cuda/mmvq-templates.cuh index d9bc431b..3948ad73 100644 --- a/ggml/src/ggml-cuda/mmvq-templates.cuh +++ b/ggml/src/ggml-cuda/mmvq-templates.cuh @@ -242,6 +242,13 @@ static __device__ void k_fused_mul_mat_vec_q( float u = tmp_u[j][threadIdx.x]; float g = tmp_g[j][threadIdx.x]; float r; + if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) { + constexpr float alpha = 1.702f; + constexpr float limit = 7.0f; + g = fminf(g, limit); + u = fmaxf(fminf(u, limit), -limit); + r = g / (1.0f + expf(-g * alpha)) * (1.0f + u); + } else { switch (unary_op) { case GGML_UNARY_OP_SILU: { @@ -266,6 +273,7 @@ static __device__ void k_fused_mul_mat_vec_q( r = g / (1.0f + expf(-g * alpha)) * (1.0f + u); } break; } + } dst[j*nrows_dst + row0 + threadIdx.x] = r; } } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index afc2336f..2c41aca3 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -262,7 +262,8 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx, if (!bias_u && !bias_g) { GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_RELU || - unary_op == GGML_UNARY_OP_GELU); + unary_op == GGML_UNARY_OP_GELU || + unary_op == GGML_UNARY_OP_SWIGLU_OAI); } else { GGML_ASSERT(unary_op == GGML_UNARY_OP_SWIGLU_OAI); GGML_ASSERT(bias_u && bias_g); diff --git a/src/graphs/build_minimaxm3.cpp b/src/graphs/build_minimaxm3.cpp index c0a1409e..388ae75d 100644 --- a/src/graphs/build_minimaxm3.cpp +++ b/src/graphs/build_minimaxm3.cpp @@ -47,7 +47,7 @@ ggml_cgraph* llm_build_context::build_minimaxm3() { model.layers[il].ffn_down_shexp, nullptr, n_expert, n_expert_used, - LLM_FFN_SWIGLU_OAI_MOE, + LLM_FFN_SWIGLU_OAI, hparams.expert_weights_norm, hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale, (llm_expert_gating_func_type) hparams.expert_gating_func, diff --git a/src/graphs/build_openai.cpp b/src/graphs/build_openai.cpp index 5d96e8df..970f91ad 100644 --- a/src/graphs/build_openai.cpp +++ b/src/graphs/build_openai.cpp @@ -43,9 +43,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // no shared experts n_expert, n_expert_used, - LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f, + LLM_FFN_SWIGLU_OAI, false, false, 0.0f, LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT, - LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true, + LLM_FFN_SWIGLU_OAI, cb, il, gf, true, model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b); cur = lctx.cvec.apply_to(ctx0, cur, il); diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 17a06b1a..32166e82 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -755,9 +755,10 @@ ggml_tensor * llm_build_context::llm_build_ffn( if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s && up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR && - (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { + (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) { auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : - type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; + type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : + type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI; auto u = (ggml_split_tensor_t *)up->extra; auto g = (ggml_split_tensor_t *)gate->extra; auto d = (ggml_split_tensor_t *)down->extra; @@ -833,9 +834,10 @@ ggml_tensor * llm_build_context::llm_build_ffn( if (lctx.cparams.fused_up_gate && up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR && - (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { + (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) { auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : - type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; + type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : + type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI; cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op); cb(cur, "ffn_up_gate", il); if (lctx.model.arch == LLM_ARCH_STEP35) { @@ -1153,16 +1155,16 @@ llm_expert_gating_func_type gating_op, // Hence, if we have biases, we cannot use fmoe. // //bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU); - bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE); + bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI); ggml_tensor * par; if (can_use_fmoe && up_gate_exps) { - if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) { + if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) { par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI); } else { - GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE); + GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI); par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU); } @@ -1173,12 +1175,12 @@ llm_expert_gating_func_type gating_op, GGML_ASSERT(!up_gate_exps && !up_gate_exps_b); if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) { - if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) { + if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) { par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI); } else { - GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE); + GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI); par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU); } @@ -1213,7 +1215,7 @@ llm_expert_gating_func_type gating_op, if (lctx.model.arch == LLM_ARCH_STEP35) { *((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il]; } - } else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) { + } else if (type_op == LLM_FFN_SWIGLU_OAI) { constexpr float alpha = 1.702f; constexpr float limit = 7.0f; par = ggml_swiglu_oai(ctx, gate, up, alpha, limit); diff --git a/src/llama-build-context.h b/src/llama-build-context.h index aeadbbed..b9a1b1d4 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -25,7 +25,6 @@ enum llm_ffn_op_type { LLM_FFN_RELU_SQR, LLM_FFN_SWIGLU, LLM_FFN_SWIGLU_OAI, - LLM_FFN_SWIGLU_OAI_MOE, }; enum llm_ffn_gate_type {