diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 409fea98..12140a25 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3488,9 +3488,16 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
         }
     }
 
-    //printf("%s: using limit = %g\n", __func__, limit);
-    ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
-                    (const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
+    auto unary_op = (ggml_unary_op)dst->op_params[0];
+    if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
+        ggml_swiglu_oai_cuda_f32((const float *)dst->data, (const float *)dst_up.get(),
+                (float *)dst->data, ggml_nelements(dst), dst->ne[0],  dst->ne[0],  dst->ne[0],
+                1.702f, 7.0f, stream);
+    } else {
+        //printf("%s: using limit = %g\n", __func__, limit);
+        ggml_fused_mul_unary(ctx, unary_op, ggml_nelements(dst),
+                (const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
+    }
     CUDA_CHECK(cudaGetLastError());
 
 }
diff --git a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
index ccf5fc67..426a2b09 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
@@ -190,6 +190,13 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
             float u = tmp_u[j][threadIdx.x];
             float g = tmp_g[j][threadIdx.x];
             float r;
+            if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                g = fminf(g, limit);
+                u = fmaxf(fminf(u, limit), -limit);
+                r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
+            } else {
             switch (unary_op) {
                 case GGML_UNARY_OP_SILU:
                     {
@@ -214,6 +221,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
                     r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
                 } break;
             }
+            }
             dst[j*nrows_dst + row0 + threadIdx.x] = r;
         }
     }
diff --git a/ggml/src/ggml-cuda/mmvq-templates.cuh b/ggml/src/ggml-cuda/mmvq-templates.cuh
index d9bc431b..3948ad73 100644
--- a/ggml/src/ggml-cuda/mmvq-templates.cuh
+++ b/ggml/src/ggml-cuda/mmvq-templates.cuh
@@ -242,6 +242,13 @@ static __device__ void k_fused_mul_mat_vec_q(
             float u = tmp_u[j][threadIdx.x];
             float g = tmp_g[j][threadIdx.x];
             float r;
+            if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                g = fminf(g, limit);
+                u = fmaxf(fminf(u, limit), -limit);
+                r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
+            } else {
             switch (unary_op) {
                 case GGML_UNARY_OP_SILU:
                     {
@@ -266,6 +273,7 @@ static __device__ void k_fused_mul_mat_vec_q(
                   r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
                 } break;
             }
+            }
             dst[j*nrows_dst + row0 + threadIdx.x] = r;
         }
     }
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index afc2336f..2c41aca3 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -262,7 +262,8 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
     if (!bias_u && !bias_g) {
         GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
                     unary_op == GGML_UNARY_OP_RELU ||
-                    unary_op == GGML_UNARY_OP_GELU);
+                    unary_op == GGML_UNARY_OP_GELU ||
+                    unary_op == GGML_UNARY_OP_SWIGLU_OAI);
     } else {
         GGML_ASSERT(unary_op == GGML_UNARY_OP_SWIGLU_OAI);
         GGML_ASSERT(bias_u && bias_g);
diff --git a/src/graphs/build_minimaxm3.cpp b/src/graphs/build_minimaxm3.cpp
index c0a1409e..388ae75d 100644
--- a/src/graphs/build_minimaxm3.cpp
+++ b/src/graphs/build_minimaxm3.cpp
@@ -47,7 +47,7 @@ ggml_cgraph* llm_build_context::build_minimaxm3() {
                     model.layers[il].ffn_down_shexp,
                     nullptr,
                     n_expert, n_expert_used,
-                    LLM_FFN_SWIGLU_OAI_MOE,
+                    LLM_FFN_SWIGLU_OAI,
                     hparams.expert_weights_norm,
                     hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale,
                     (llm_expert_gating_func_type) hparams.expert_gating_func,
diff --git a/src/graphs/build_openai.cpp b/src/graphs/build_openai.cpp
index 5d96e8df..970f91ad 100644
--- a/src/graphs/build_openai.cpp
+++ b/src/graphs/build_openai.cpp
@@ -43,9 +43,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
                 nullptr,
                 nullptr,  nullptr, nullptr,  nullptr, nullptr,  nullptr, // no shared experts
                 n_expert, n_expert_used,
-                LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
+                LLM_FFN_SWIGLU_OAI, false, false, 0.0f,
                 LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
-                LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
+                LLM_FFN_SWIGLU_OAI, cb, il, gf, true,
                 model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);
 
         cur = lctx.cvec.apply_to(ctx0, cur, il);
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 17a06b1a..32166e82 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -755,9 +755,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
 
     if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s &&
         up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR &&
-        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
+        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
         auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
-                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
+                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
+                        type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
         auto u = (ggml_split_tensor_t *)up->extra;
         auto g = (ggml_split_tensor_t *)gate->extra;
         auto d = (ggml_split_tensor_t *)down->extra;
@@ -833,9 +834,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
 
     if (lctx.cparams.fused_up_gate &&
         up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR &&
-        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
+        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
         auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
-                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
+                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
+                        type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
         cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
         cb(cur, "ffn_up_gate", il);
         if (lctx.model.arch == LLM_ARCH_STEP35) {
@@ -1153,16 +1155,16 @@ llm_expert_gating_func_type   gating_op,
     // Hence, if we have biases, we cannot use fmoe.
     //
     //bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
-    bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
+    bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI);
 
     ggml_tensor * par;
     if (can_use_fmoe && up_gate_exps) {
-        if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
+        if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
             par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
                     type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
                     type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
         } else {
-            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
+            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
             par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
                     type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
         }
@@ -1173,12 +1175,12 @@ llm_expert_gating_func_type   gating_op,
     GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
 
     if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
-        if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
+        if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
             par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
                     type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
                     type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
         } else {
-            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
+            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
             par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
                     type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
         }
@@ -1213,7 +1215,7 @@ llm_expert_gating_func_type   gating_op,
             if (lctx.model.arch == LLM_ARCH_STEP35) {
                 *((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
             }
-        } else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
+        } else if (type_op == LLM_FFN_SWIGLU_OAI) {
             constexpr float alpha = 1.702f;
             constexpr float limit = 7.0f;
             par = ggml_swiglu_oai(ctx, gate, up, alpha, limit);
diff --git a/src/llama-build-context.h b/src/llama-build-context.h
index aeadbbed..b9a1b1d4 100644
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -25,7 +25,6 @@ enum llm_ffn_op_type {
     LLM_FFN_RELU_SQR,
     LLM_FFN_SWIGLU,
     LLM_FFN_SWIGLU_OAI,
-    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {